{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.998922800718133,
"eval_steps": 500,
"global_step": 2784,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010771992818671453,
"grad_norm": 5.9528489112854,
"learning_rate": 3.5842293906810036e-08,
"loss": 0.8283,
"step": 1
},
{
"epoch": 0.0021543985637342907,
"grad_norm": 6.202611446380615,
"learning_rate": 7.168458781362007e-08,
"loss": 0.8397,
"step": 2
},
{
"epoch": 0.003231597845601436,
"grad_norm": 6.138794898986816,
"learning_rate": 1.0752688172043012e-07,
"loss": 0.8405,
"step": 3
},
{
"epoch": 0.004308797127468581,
"grad_norm": 6.084003448486328,
"learning_rate": 1.4336917562724014e-07,
"loss": 0.8372,
"step": 4
},
{
"epoch": 0.005385996409335727,
"grad_norm": 6.136441707611084,
"learning_rate": 1.7921146953405018e-07,
"loss": 0.8356,
"step": 5
},
{
"epoch": 0.006463195691202872,
"grad_norm": 6.009083271026611,
"learning_rate": 2.1505376344086024e-07,
"loss": 0.8111,
"step": 6
},
{
"epoch": 0.0075403949730700175,
"grad_norm": 5.893197059631348,
"learning_rate": 2.508960573476703e-07,
"loss": 0.7935,
"step": 7
},
{
"epoch": 0.008617594254937163,
"grad_norm": 6.211028575897217,
"learning_rate": 2.867383512544803e-07,
"loss": 0.8436,
"step": 8
},
{
"epoch": 0.00969479353680431,
"grad_norm": 6.155322551727295,
"learning_rate": 3.2258064516129035e-07,
"loss": 0.857,
"step": 9
},
{
"epoch": 0.010771992818671455,
"grad_norm": 6.04060697555542,
"learning_rate": 3.5842293906810036e-07,
"loss": 0.8241,
"step": 10
},
{
"epoch": 0.0118491921005386,
"grad_norm": 5.9382829666137695,
"learning_rate": 3.942652329749104e-07,
"loss": 0.8224,
"step": 11
},
{
"epoch": 0.012926391382405745,
"grad_norm": 5.773810386657715,
"learning_rate": 4.301075268817205e-07,
"loss": 0.8267,
"step": 12
},
{
"epoch": 0.01400359066427289,
"grad_norm": 5.450674057006836,
"learning_rate": 4.6594982078853055e-07,
"loss": 0.7886,
"step": 13
},
{
"epoch": 0.015080789946140035,
"grad_norm": 5.492709636688232,
"learning_rate": 5.017921146953406e-07,
"loss": 0.7966,
"step": 14
},
{
"epoch": 0.01615798922800718,
"grad_norm": 5.319929122924805,
"learning_rate": 5.376344086021506e-07,
"loss": 0.7823,
"step": 15
},
{
"epoch": 0.017235188509874325,
"grad_norm": 5.306995868682861,
"learning_rate": 5.734767025089606e-07,
"loss": 0.7923,
"step": 16
},
{
"epoch": 0.018312387791741474,
"grad_norm": 4.499080657958984,
"learning_rate": 6.093189964157707e-07,
"loss": 0.7781,
"step": 17
},
{
"epoch": 0.01938958707360862,
"grad_norm": 4.392838001251221,
"learning_rate": 6.451612903225807e-07,
"loss": 0.786,
"step": 18
},
{
"epoch": 0.020466786355475764,
"grad_norm": 4.141458034515381,
"learning_rate": 6.810035842293908e-07,
"loss": 0.7563,
"step": 19
},
{
"epoch": 0.02154398563734291,
"grad_norm": 4.173211574554443,
"learning_rate": 7.168458781362007e-07,
"loss": 0.7871,
"step": 20
},
{
"epoch": 0.022621184919210054,
"grad_norm": 3.9070355892181396,
"learning_rate": 7.526881720430108e-07,
"loss": 0.7424,
"step": 21
},
{
"epoch": 0.0236983842010772,
"grad_norm": 3.5647006034851074,
"learning_rate": 7.885304659498208e-07,
"loss": 0.7427,
"step": 22
},
{
"epoch": 0.024775583482944345,
"grad_norm": 2.366295099258423,
"learning_rate": 8.243727598566309e-07,
"loss": 0.6975,
"step": 23
},
{
"epoch": 0.02585278276481149,
"grad_norm": 2.3025567531585693,
"learning_rate": 8.60215053763441e-07,
"loss": 0.7618,
"step": 24
},
{
"epoch": 0.026929982046678635,
"grad_norm": 2.1890130043029785,
"learning_rate": 8.96057347670251e-07,
"loss": 0.7025,
"step": 25
},
{
"epoch": 0.02800718132854578,
"grad_norm": 2.134795665740967,
"learning_rate": 9.318996415770611e-07,
"loss": 0.7343,
"step": 26
},
{
"epoch": 0.029084380610412925,
"grad_norm": 1.9905577898025513,
"learning_rate": 9.67741935483871e-07,
"loss": 0.7293,
"step": 27
},
{
"epoch": 0.03016157989228007,
"grad_norm": 1.9130618572235107,
"learning_rate": 1.0035842293906811e-06,
"loss": 0.6949,
"step": 28
},
{
"epoch": 0.03123877917414722,
"grad_norm": 1.8031822443008423,
"learning_rate": 1.039426523297491e-06,
"loss": 0.7427,
"step": 29
},
{
"epoch": 0.03231597845601436,
"grad_norm": 1.4684985876083374,
"learning_rate": 1.0752688172043011e-06,
"loss": 0.7062,
"step": 30
},
{
"epoch": 0.03339317773788151,
"grad_norm": 1.7485400438308716,
"learning_rate": 1.111111111111111e-06,
"loss": 0.6973,
"step": 31
},
{
"epoch": 0.03447037701974865,
"grad_norm": 2.080551862716675,
"learning_rate": 1.1469534050179212e-06,
"loss": 0.667,
"step": 32
},
{
"epoch": 0.0355475763016158,
"grad_norm": 2.273972749710083,
"learning_rate": 1.1827956989247313e-06,
"loss": 0.6813,
"step": 33
},
{
"epoch": 0.03662477558348295,
"grad_norm": 2.2604429721832275,
"learning_rate": 1.2186379928315414e-06,
"loss": 0.6723,
"step": 34
},
{
"epoch": 0.03770197486535009,
"grad_norm": 2.1238536834716797,
"learning_rate": 1.2544802867383513e-06,
"loss": 0.6494,
"step": 35
},
{
"epoch": 0.03877917414721724,
"grad_norm": 2.0893118381500244,
"learning_rate": 1.2903225806451614e-06,
"loss": 0.6818,
"step": 36
},
{
"epoch": 0.03985637342908438,
"grad_norm": 2.0013720989227295,
"learning_rate": 1.3261648745519715e-06,
"loss": 0.6687,
"step": 37
},
{
"epoch": 0.04093357271095153,
"grad_norm": 1.5645451545715332,
"learning_rate": 1.3620071684587816e-06,
"loss": 0.6511,
"step": 38
},
{
"epoch": 0.04201077199281867,
"grad_norm": 1.2635418176651,
"learning_rate": 1.3978494623655913e-06,
"loss": 0.6309,
"step": 39
},
{
"epoch": 0.04308797127468582,
"grad_norm": 1.1290279626846313,
"learning_rate": 1.4336917562724014e-06,
"loss": 0.6537,
"step": 40
},
{
"epoch": 0.04416517055655296,
"grad_norm": 0.8919850587844849,
"learning_rate": 1.4695340501792116e-06,
"loss": 0.6563,
"step": 41
},
{
"epoch": 0.04524236983842011,
"grad_norm": 0.9924504160881042,
"learning_rate": 1.5053763440860217e-06,
"loss": 0.6477,
"step": 42
},
{
"epoch": 0.04631956912028725,
"grad_norm": 1.03560471534729,
"learning_rate": 1.5412186379928318e-06,
"loss": 0.6306,
"step": 43
},
{
"epoch": 0.0473967684021544,
"grad_norm": 1.0033239126205444,
"learning_rate": 1.5770609318996417e-06,
"loss": 0.5894,
"step": 44
},
{
"epoch": 0.04847396768402154,
"grad_norm": 1.0003079175949097,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.6201,
"step": 45
},
{
"epoch": 0.04955116696588869,
"grad_norm": 0.8706744909286499,
"learning_rate": 1.6487455197132617e-06,
"loss": 0.6225,
"step": 46
},
{
"epoch": 0.05062836624775584,
"grad_norm": 0.773290753364563,
"learning_rate": 1.6845878136200718e-06,
"loss": 0.6072,
"step": 47
},
{
"epoch": 0.05170556552962298,
"grad_norm": 0.6847248673439026,
"learning_rate": 1.720430107526882e-06,
"loss": 0.6025,
"step": 48
},
{
"epoch": 0.05278276481149013,
"grad_norm": 0.7232184410095215,
"learning_rate": 1.7562724014336918e-06,
"loss": 0.5965,
"step": 49
},
{
"epoch": 0.05385996409335727,
"grad_norm": 0.6927616000175476,
"learning_rate": 1.792114695340502e-06,
"loss": 0.5922,
"step": 50
},
{
"epoch": 0.05493716337522442,
"grad_norm": 0.637067973613739,
"learning_rate": 1.827956989247312e-06,
"loss": 0.5962,
"step": 51
},
{
"epoch": 0.05601436265709156,
"grad_norm": 0.6449267864227295,
"learning_rate": 1.8637992831541222e-06,
"loss": 0.6068,
"step": 52
},
{
"epoch": 0.05709156193895871,
"grad_norm": 0.6247251629829407,
"learning_rate": 1.8996415770609319e-06,
"loss": 0.5889,
"step": 53
},
{
"epoch": 0.05816876122082585,
"grad_norm": 0.5862783789634705,
"learning_rate": 1.935483870967742e-06,
"loss": 0.6081,
"step": 54
},
{
"epoch": 0.059245960502693,
"grad_norm": 0.6185500621795654,
"learning_rate": 1.9713261648745523e-06,
"loss": 0.6203,
"step": 55
},
{
"epoch": 0.06032315978456014,
"grad_norm": 0.5183306336402893,
"learning_rate": 2.0071684587813622e-06,
"loss": 0.5865,
"step": 56
},
{
"epoch": 0.06140035906642729,
"grad_norm": 0.5492181777954102,
"learning_rate": 2.043010752688172e-06,
"loss": 0.5726,
"step": 57
},
{
"epoch": 0.06247755834829444,
"grad_norm": 0.49654823541641235,
"learning_rate": 2.078853046594982e-06,
"loss": 0.5994,
"step": 58
},
{
"epoch": 0.06355475763016158,
"grad_norm": 0.5319607853889465,
"learning_rate": 2.1146953405017924e-06,
"loss": 0.5971,
"step": 59
},
{
"epoch": 0.06463195691202872,
"grad_norm": 0.5596228837966919,
"learning_rate": 2.1505376344086023e-06,
"loss": 0.5693,
"step": 60
},
{
"epoch": 0.06570915619389588,
"grad_norm": 0.478316068649292,
"learning_rate": 2.1863799283154126e-06,
"loss": 0.575,
"step": 61
},
{
"epoch": 0.06678635547576302,
"grad_norm": 0.4188225269317627,
"learning_rate": 2.222222222222222e-06,
"loss": 0.5606,
"step": 62
},
{
"epoch": 0.06786355475763016,
"grad_norm": 0.45341843366622925,
"learning_rate": 2.2580645161290324e-06,
"loss": 0.5432,
"step": 63
},
{
"epoch": 0.0689407540394973,
"grad_norm": 0.46085870265960693,
"learning_rate": 2.2939068100358423e-06,
"loss": 0.5684,
"step": 64
},
{
"epoch": 0.07001795332136446,
"grad_norm": 0.4461316466331482,
"learning_rate": 2.3297491039426526e-06,
"loss": 0.5609,
"step": 65
},
{
"epoch": 0.0710951526032316,
"grad_norm": 0.4758760929107666,
"learning_rate": 2.3655913978494625e-06,
"loss": 0.5572,
"step": 66
},
{
"epoch": 0.07217235188509874,
"grad_norm": 0.5010541081428528,
"learning_rate": 2.4014336917562724e-06,
"loss": 0.5853,
"step": 67
},
{
"epoch": 0.0732495511669659,
"grad_norm": 0.4104984402656555,
"learning_rate": 2.4372759856630828e-06,
"loss": 0.5486,
"step": 68
},
{
"epoch": 0.07432675044883304,
"grad_norm": 0.41943588852882385,
"learning_rate": 2.4731182795698927e-06,
"loss": 0.5896,
"step": 69
},
{
"epoch": 0.07540394973070018,
"grad_norm": 0.46157947182655334,
"learning_rate": 2.5089605734767026e-06,
"loss": 0.5513,
"step": 70
},
{
"epoch": 0.07648114901256732,
"grad_norm": 0.39336729049682617,
"learning_rate": 2.544802867383513e-06,
"loss": 0.5367,
"step": 71
},
{
"epoch": 0.07755834829443448,
"grad_norm": 0.46617960929870605,
"learning_rate": 2.580645161290323e-06,
"loss": 0.5795,
"step": 72
},
{
"epoch": 0.07863554757630162,
"grad_norm": 0.3717260956764221,
"learning_rate": 2.616487455197133e-06,
"loss": 0.5363,
"step": 73
},
{
"epoch": 0.07971274685816876,
"grad_norm": 0.4486253261566162,
"learning_rate": 2.652329749103943e-06,
"loss": 0.5446,
"step": 74
},
{
"epoch": 0.0807899461400359,
"grad_norm": 0.3776644468307495,
"learning_rate": 2.688172043010753e-06,
"loss": 0.5549,
"step": 75
},
{
"epoch": 0.08186714542190306,
"grad_norm": 0.4512787163257599,
"learning_rate": 2.7240143369175633e-06,
"loss": 0.5519,
"step": 76
},
{
"epoch": 0.0829443447037702,
"grad_norm": 0.44562798738479614,
"learning_rate": 2.7598566308243727e-06,
"loss": 0.5542,
"step": 77
},
{
"epoch": 0.08402154398563734,
"grad_norm": 0.41436710953712463,
"learning_rate": 2.7956989247311827e-06,
"loss": 0.5381,
"step": 78
},
{
"epoch": 0.0850987432675045,
"grad_norm": 0.39803346991539,
"learning_rate": 2.831541218637993e-06,
"loss": 0.5214,
"step": 79
},
{
"epoch": 0.08617594254937164,
"grad_norm": 0.41391584277153015,
"learning_rate": 2.867383512544803e-06,
"loss": 0.5379,
"step": 80
},
{
"epoch": 0.08725314183123878,
"grad_norm": 0.4134213924407959,
"learning_rate": 2.903225806451613e-06,
"loss": 0.5846,
"step": 81
},
{
"epoch": 0.08833034111310592,
"grad_norm": 0.4272231459617615,
"learning_rate": 2.939068100358423e-06,
"loss": 0.5338,
"step": 82
},
{
"epoch": 0.08940754039497308,
"grad_norm": 0.46061354875564575,
"learning_rate": 2.974910394265233e-06,
"loss": 0.5532,
"step": 83
},
{
"epoch": 0.09048473967684022,
"grad_norm": 0.4512515664100647,
"learning_rate": 3.0107526881720433e-06,
"loss": 0.5447,
"step": 84
},
{
"epoch": 0.09156193895870736,
"grad_norm": 0.40919914841651917,
"learning_rate": 3.0465949820788532e-06,
"loss": 0.531,
"step": 85
},
{
"epoch": 0.0926391382405745,
"grad_norm": 0.3911254405975342,
"learning_rate": 3.0824372759856636e-06,
"loss": 0.5511,
"step": 86
},
{
"epoch": 0.09371633752244166,
"grad_norm": 0.40862536430358887,
"learning_rate": 3.1182795698924735e-06,
"loss": 0.5295,
"step": 87
},
{
"epoch": 0.0947935368043088,
"grad_norm": 0.352711021900177,
"learning_rate": 3.1541218637992834e-06,
"loss": 0.5334,
"step": 88
},
{
"epoch": 0.09587073608617594,
"grad_norm": 0.3540276288986206,
"learning_rate": 3.1899641577060937e-06,
"loss": 0.5136,
"step": 89
},
{
"epoch": 0.09694793536804308,
"grad_norm": 0.3934048116207123,
"learning_rate": 3.225806451612903e-06,
"loss": 0.5266,
"step": 90
},
{
"epoch": 0.09802513464991024,
"grad_norm": 0.4379293620586395,
"learning_rate": 3.261648745519714e-06,
"loss": 0.5493,
"step": 91
},
{
"epoch": 0.09910233393177738,
"grad_norm": 0.3995460271835327,
"learning_rate": 3.2974910394265234e-06,
"loss": 0.5288,
"step": 92
},
{
"epoch": 0.10017953321364452,
"grad_norm": 0.41289108991622925,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.5408,
"step": 93
},
{
"epoch": 0.10125673249551168,
"grad_norm": 0.34702348709106445,
"learning_rate": 3.3691756272401437e-06,
"loss": 0.528,
"step": 94
},
{
"epoch": 0.10233393177737882,
"grad_norm": 0.3608455955982208,
"learning_rate": 3.4050179211469536e-06,
"loss": 0.4823,
"step": 95
},
{
"epoch": 0.10341113105924596,
"grad_norm": 0.4204634428024292,
"learning_rate": 3.440860215053764e-06,
"loss": 0.565,
"step": 96
},
{
"epoch": 0.1044883303411131,
"grad_norm": 0.3752126395702362,
"learning_rate": 3.4767025089605738e-06,
"loss": 0.5236,
"step": 97
},
{
"epoch": 0.10556552962298026,
"grad_norm": 0.4099592864513397,
"learning_rate": 3.5125448028673837e-06,
"loss": 0.5262,
"step": 98
},
{
"epoch": 0.1066427289048474,
"grad_norm": 0.3806130886077881,
"learning_rate": 3.548387096774194e-06,
"loss": 0.5216,
"step": 99
},
{
"epoch": 0.10771992818671454,
"grad_norm": 0.3793143928050995,
"learning_rate": 3.584229390681004e-06,
"loss": 0.541,
"step": 100
},
{
"epoch": 0.10879712746858168,
"grad_norm": 0.35247287154197693,
"learning_rate": 3.620071684587814e-06,
"loss": 0.5112,
"step": 101
},
{
"epoch": 0.10987432675044884,
"grad_norm": 0.39345964789390564,
"learning_rate": 3.655913978494624e-06,
"loss": 0.5175,
"step": 102
},
{
"epoch": 0.11095152603231598,
"grad_norm": 0.3889741003513336,
"learning_rate": 3.691756272401434e-06,
"loss": 0.5331,
"step": 103
},
{
"epoch": 0.11202872531418312,
"grad_norm": 0.38461363315582275,
"learning_rate": 3.7275985663082444e-06,
"loss": 0.5457,
"step": 104
},
{
"epoch": 0.11310592459605028,
"grad_norm": 0.3945530354976654,
"learning_rate": 3.763440860215054e-06,
"loss": 0.4938,
"step": 105
},
{
"epoch": 0.11418312387791742,
"grad_norm": 0.37691155076026917,
"learning_rate": 3.7992831541218638e-06,
"loss": 0.5173,
"step": 106
},
{
"epoch": 0.11526032315978456,
"grad_norm": 0.3901672661304474,
"learning_rate": 3.8351254480286745e-06,
"loss": 0.5205,
"step": 107
},
{
"epoch": 0.1163375224416517,
"grad_norm": 0.4444199800491333,
"learning_rate": 3.870967741935484e-06,
"loss": 0.5323,
"step": 108
},
{
"epoch": 0.11741472172351886,
"grad_norm": 0.3674546778202057,
"learning_rate": 3.906810035842294e-06,
"loss": 0.5431,
"step": 109
},
{
"epoch": 0.118491921005386,
"grad_norm": 0.3770754635334015,
"learning_rate": 3.942652329749105e-06,
"loss": 0.533,
"step": 110
},
{
"epoch": 0.11956912028725314,
"grad_norm": 0.3679567277431488,
"learning_rate": 3.978494623655914e-06,
"loss": 0.4874,
"step": 111
},
{
"epoch": 0.12064631956912028,
"grad_norm": 0.40148189663887024,
"learning_rate": 4.0143369175627245e-06,
"loss": 0.5143,
"step": 112
},
{
"epoch": 0.12172351885098744,
"grad_norm": 0.4085795283317566,
"learning_rate": 4.050179211469534e-06,
"loss": 0.5196,
"step": 113
},
{
"epoch": 0.12280071813285458,
"grad_norm": 0.3965912163257599,
"learning_rate": 4.086021505376344e-06,
"loss": 0.5159,
"step": 114
},
{
"epoch": 0.12387791741472172,
"grad_norm": 0.36728113889694214,
"learning_rate": 4.121863799283155e-06,
"loss": 0.5193,
"step": 115
},
{
"epoch": 0.12495511669658887,
"grad_norm": 0.4603145122528076,
"learning_rate": 4.157706093189964e-06,
"loss": 0.5198,
"step": 116
},
{
"epoch": 0.12603231597845602,
"grad_norm": 0.3981267511844635,
"learning_rate": 4.193548387096774e-06,
"loss": 0.509,
"step": 117
},
{
"epoch": 0.12710951526032316,
"grad_norm": 0.3856450915336609,
"learning_rate": 4.229390681003585e-06,
"loss": 0.5195,
"step": 118
},
{
"epoch": 0.1281867145421903,
"grad_norm": 0.35274478793144226,
"learning_rate": 4.265232974910394e-06,
"loss": 0.5232,
"step": 119
},
{
"epoch": 0.12926391382405744,
"grad_norm": 0.35597339272499084,
"learning_rate": 4.3010752688172045e-06,
"loss": 0.5078,
"step": 120
},
{
"epoch": 0.13034111310592458,
"grad_norm": 0.395088791847229,
"learning_rate": 4.336917562724015e-06,
"loss": 0.5084,
"step": 121
},
{
"epoch": 0.13141831238779175,
"grad_norm": 0.4228588044643402,
"learning_rate": 4.372759856630825e-06,
"loss": 0.503,
"step": 122
},
{
"epoch": 0.1324955116696589,
"grad_norm": 0.4004088342189789,
"learning_rate": 4.408602150537635e-06,
"loss": 0.4904,
"step": 123
},
{
"epoch": 0.13357271095152604,
"grad_norm": 0.4043223559856415,
"learning_rate": 4.444444444444444e-06,
"loss": 0.5073,
"step": 124
},
{
"epoch": 0.13464991023339318,
"grad_norm": 0.41619932651519775,
"learning_rate": 4.480286738351255e-06,
"loss": 0.5268,
"step": 125
},
{
"epoch": 0.13572710951526032,
"grad_norm": 0.4488072395324707,
"learning_rate": 4.516129032258065e-06,
"loss": 0.4877,
"step": 126
},
{
"epoch": 0.13680430879712746,
"grad_norm": 0.4372597932815552,
"learning_rate": 4.551971326164875e-06,
"loss": 0.5501,
"step": 127
},
{
"epoch": 0.1378815080789946,
"grad_norm": 0.47266843914985657,
"learning_rate": 4.587813620071685e-06,
"loss": 0.5074,
"step": 128
},
{
"epoch": 0.13895870736086177,
"grad_norm": 0.38537442684173584,
"learning_rate": 4.623655913978495e-06,
"loss": 0.5266,
"step": 129
},
{
"epoch": 0.1400359066427289,
"grad_norm": 0.37685397267341614,
"learning_rate": 4.659498207885305e-06,
"loss": 0.515,
"step": 130
},
{
"epoch": 0.14111310592459606,
"grad_norm": 0.40007224678993225,
"learning_rate": 4.695340501792115e-06,
"loss": 0.5,
"step": 131
},
{
"epoch": 0.1421903052064632,
"grad_norm": 0.4487532675266266,
"learning_rate": 4.731182795698925e-06,
"loss": 0.4944,
"step": 132
},
{
"epoch": 0.14326750448833034,
"grad_norm": 0.37208762764930725,
"learning_rate": 4.767025089605735e-06,
"loss": 0.4854,
"step": 133
},
{
"epoch": 0.14434470377019748,
"grad_norm": 0.36964190006256104,
"learning_rate": 4.802867383512545e-06,
"loss": 0.528,
"step": 134
},
{
"epoch": 0.14542190305206462,
"grad_norm": 0.442440927028656,
"learning_rate": 4.838709677419355e-06,
"loss": 0.5021,
"step": 135
},
{
"epoch": 0.1464991023339318,
"grad_norm": 0.4261108338832855,
"learning_rate": 4.8745519713261655e-06,
"loss": 0.5145,
"step": 136
},
{
"epoch": 0.14757630161579893,
"grad_norm": 0.37978559732437134,
"learning_rate": 4.910394265232976e-06,
"loss": 0.5031,
"step": 137
},
{
"epoch": 0.14865350089766607,
"grad_norm": 0.3788897693157196,
"learning_rate": 4.946236559139785e-06,
"loss": 0.5209,
"step": 138
},
{
"epoch": 0.14973070017953322,
"grad_norm": 0.4541829228401184,
"learning_rate": 4.982078853046595e-06,
"loss": 0.5169,
"step": 139
},
{
"epoch": 0.15080789946140036,
"grad_norm": 0.4718886911869049,
"learning_rate": 5.017921146953405e-06,
"loss": 0.5229,
"step": 140
},
{
"epoch": 0.1518850987432675,
"grad_norm": 0.3844756484031677,
"learning_rate": 5.0537634408602155e-06,
"loss": 0.4953,
"step": 141
},
{
"epoch": 0.15296229802513464,
"grad_norm": 0.4022759199142456,
"learning_rate": 5.089605734767026e-06,
"loss": 0.5184,
"step": 142
},
{
"epoch": 0.15403949730700178,
"grad_norm": 0.47823449969291687,
"learning_rate": 5.125448028673835e-06,
"loss": 0.5207,
"step": 143
},
{
"epoch": 0.15511669658886895,
"grad_norm": 0.38162872195243835,
"learning_rate": 5.161290322580646e-06,
"loss": 0.5119,
"step": 144
},
{
"epoch": 0.1561938958707361,
"grad_norm": 0.4117828607559204,
"learning_rate": 5.197132616487456e-06,
"loss": 0.4929,
"step": 145
},
{
"epoch": 0.15727109515260324,
"grad_norm": 0.4006747305393219,
"learning_rate": 5.232974910394266e-06,
"loss": 0.5193,
"step": 146
},
{
"epoch": 0.15834829443447038,
"grad_norm": 0.40755030512809753,
"learning_rate": 5.268817204301076e-06,
"loss": 0.488,
"step": 147
},
{
"epoch": 0.15942549371633752,
"grad_norm": 0.3836679458618164,
"learning_rate": 5.304659498207886e-06,
"loss": 0.491,
"step": 148
},
{
"epoch": 0.16050269299820466,
"grad_norm": 0.3859885036945343,
"learning_rate": 5.340501792114696e-06,
"loss": 0.5061,
"step": 149
},
{
"epoch": 0.1615798922800718,
"grad_norm": 0.41005122661590576,
"learning_rate": 5.376344086021506e-06,
"loss": 0.4995,
"step": 150
},
{
"epoch": 0.16265709156193897,
"grad_norm": 0.40823987126350403,
"learning_rate": 5.412186379928316e-06,
"loss": 0.5112,
"step": 151
},
{
"epoch": 0.1637342908438061,
"grad_norm": 0.4053882956504822,
"learning_rate": 5.4480286738351265e-06,
"loss": 0.5142,
"step": 152
},
{
"epoch": 0.16481149012567325,
"grad_norm": 0.4068973958492279,
"learning_rate": 5.483870967741935e-06,
"loss": 0.5034,
"step": 153
},
{
"epoch": 0.1658886894075404,
"grad_norm": 0.36175912618637085,
"learning_rate": 5.5197132616487455e-06,
"loss": 0.494,
"step": 154
},
{
"epoch": 0.16696588868940754,
"grad_norm": 0.41930872201919556,
"learning_rate": 5.555555555555557e-06,
"loss": 0.5213,
"step": 155
},
{
"epoch": 0.16804308797127468,
"grad_norm": 0.38556304574012756,
"learning_rate": 5.591397849462365e-06,
"loss": 0.4963,
"step": 156
},
{
"epoch": 0.16912028725314182,
"grad_norm": 0.40004435181617737,
"learning_rate": 5.627240143369176e-06,
"loss": 0.4789,
"step": 157
},
{
"epoch": 0.170197486535009,
"grad_norm": 0.3727085292339325,
"learning_rate": 5.663082437275986e-06,
"loss": 0.5245,
"step": 158
},
{
"epoch": 0.17127468581687613,
"grad_norm": 0.37608808279037476,
"learning_rate": 5.698924731182796e-06,
"loss": 0.5005,
"step": 159
},
{
"epoch": 0.17235188509874327,
"grad_norm": 0.4182046353816986,
"learning_rate": 5.734767025089606e-06,
"loss": 0.4921,
"step": 160
},
{
"epoch": 0.17342908438061042,
"grad_norm": 0.37144991755485535,
"learning_rate": 5.770609318996416e-06,
"loss": 0.4845,
"step": 161
},
{
"epoch": 0.17450628366247756,
"grad_norm": 0.4116148054599762,
"learning_rate": 5.806451612903226e-06,
"loss": 0.4995,
"step": 162
},
{
"epoch": 0.1755834829443447,
"grad_norm": 0.37438467144966125,
"learning_rate": 5.842293906810036e-06,
"loss": 0.4694,
"step": 163
},
{
"epoch": 0.17666068222621184,
"grad_norm": 0.4422660768032074,
"learning_rate": 5.878136200716846e-06,
"loss": 0.4916,
"step": 164
},
{
"epoch": 0.17773788150807898,
"grad_norm": 0.403758704662323,
"learning_rate": 5.9139784946236566e-06,
"loss": 0.5126,
"step": 165
},
{
"epoch": 0.17881508078994615,
"grad_norm": 0.4406896233558655,
"learning_rate": 5.949820788530466e-06,
"loss": 0.4705,
"step": 166
},
{
"epoch": 0.1798922800718133,
"grad_norm": 0.4464195966720581,
"learning_rate": 5.985663082437276e-06,
"loss": 0.5287,
"step": 167
},
{
"epoch": 0.18096947935368043,
"grad_norm": 0.511547327041626,
"learning_rate": 6.021505376344087e-06,
"loss": 0.5037,
"step": 168
},
{
"epoch": 0.18204667863554758,
"grad_norm": 0.4360441267490387,
"learning_rate": 6.057347670250897e-06,
"loss": 0.5254,
"step": 169
},
{
"epoch": 0.18312387791741472,
"grad_norm": 0.44195273518562317,
"learning_rate": 6.0931899641577065e-06,
"loss": 0.5034,
"step": 170
},
{
"epoch": 0.18420107719928186,
"grad_norm": 0.4096381664276123,
"learning_rate": 6.129032258064517e-06,
"loss": 0.498,
"step": 171
},
{
"epoch": 0.185278276481149,
"grad_norm": 0.5594648718833923,
"learning_rate": 6.164874551971327e-06,
"loss": 0.4857,
"step": 172
},
{
"epoch": 0.18635547576301617,
"grad_norm": 0.4190889298915863,
"learning_rate": 6.200716845878137e-06,
"loss": 0.5091,
"step": 173
},
{
"epoch": 0.1874326750448833,
"grad_norm": 0.5348290801048279,
"learning_rate": 6.236559139784947e-06,
"loss": 0.528,
"step": 174
},
{
"epoch": 0.18850987432675045,
"grad_norm": 0.5012261867523193,
"learning_rate": 6.272401433691757e-06,
"loss": 0.4871,
"step": 175
},
{
"epoch": 0.1895870736086176,
"grad_norm": 0.45305487513542175,
"learning_rate": 6.308243727598567e-06,
"loss": 0.4961,
"step": 176
},
{
"epoch": 0.19066427289048474,
"grad_norm": 0.47502607107162476,
"learning_rate": 6.344086021505377e-06,
"loss": 0.5079,
"step": 177
},
{
"epoch": 0.19174147217235188,
"grad_norm": 0.4623589813709259,
"learning_rate": 6.379928315412187e-06,
"loss": 0.494,
"step": 178
},
{
"epoch": 0.19281867145421902,
"grad_norm": 0.3726208209991455,
"learning_rate": 6.415770609318996e-06,
"loss": 0.4845,
"step": 179
},
{
"epoch": 0.19389587073608616,
"grad_norm": 0.49059590697288513,
"learning_rate": 6.451612903225806e-06,
"loss": 0.5111,
"step": 180
},
{
"epoch": 0.19497307001795333,
"grad_norm": 0.4289363920688629,
"learning_rate": 6.4874551971326176e-06,
"loss": 0.4782,
"step": 181
},
{
"epoch": 0.19605026929982047,
"grad_norm": 0.4204278886318207,
"learning_rate": 6.523297491039428e-06,
"loss": 0.4831,
"step": 182
},
{
"epoch": 0.19712746858168761,
"grad_norm": 0.450005441904068,
"learning_rate": 6.5591397849462365e-06,
"loss": 0.4832,
"step": 183
},
{
"epoch": 0.19820466786355476,
"grad_norm": 0.44153162837028503,
"learning_rate": 6.594982078853047e-06,
"loss": 0.5056,
"step": 184
},
{
"epoch": 0.1992818671454219,
"grad_norm": 0.4572013318538666,
"learning_rate": 6.630824372759857e-06,
"loss": 0.4802,
"step": 185
},
{
"epoch": 0.20035906642728904,
"grad_norm": 0.42412129044532776,
"learning_rate": 6.666666666666667e-06,
"loss": 0.4946,
"step": 186
},
{
"epoch": 0.20143626570915618,
"grad_norm": 0.4811611473560333,
"learning_rate": 6.702508960573477e-06,
"loss": 0.4898,
"step": 187
},
{
"epoch": 0.20251346499102335,
"grad_norm": 0.47502169013023376,
"learning_rate": 6.738351254480287e-06,
"loss": 0.5057,
"step": 188
},
{
"epoch": 0.2035906642728905,
"grad_norm": 0.5099421143531799,
"learning_rate": 6.774193548387097e-06,
"loss": 0.4911,
"step": 189
},
{
"epoch": 0.20466786355475763,
"grad_norm": 0.5116240382194519,
"learning_rate": 6.810035842293907e-06,
"loss": 0.4898,
"step": 190
},
{
"epoch": 0.20574506283662478,
"grad_norm": 0.44598788022994995,
"learning_rate": 6.8458781362007174e-06,
"loss": 0.507,
"step": 191
},
{
"epoch": 0.20682226211849192,
"grad_norm": 0.45186325907707214,
"learning_rate": 6.881720430107528e-06,
"loss": 0.4797,
"step": 192
},
{
"epoch": 0.20789946140035906,
"grad_norm": 0.5284622311592102,
"learning_rate": 6.917562724014337e-06,
"loss": 0.4945,
"step": 193
},
{
"epoch": 0.2089766606822262,
"grad_norm": 0.39712581038475037,
"learning_rate": 6.9534050179211476e-06,
"loss": 0.5008,
"step": 194
},
{
"epoch": 0.21005385996409337,
"grad_norm": 0.44355469942092896,
"learning_rate": 6.989247311827958e-06,
"loss": 0.4826,
"step": 195
},
{
"epoch": 0.2111310592459605,
"grad_norm": 0.40434572100639343,
"learning_rate": 7.025089605734767e-06,
"loss": 0.468,
"step": 196
},
{
"epoch": 0.21220825852782765,
"grad_norm": 0.43578436970710754,
"learning_rate": 7.060931899641578e-06,
"loss": 0.4916,
"step": 197
},
{
"epoch": 0.2132854578096948,
"grad_norm": 0.4185442626476288,
"learning_rate": 7.096774193548388e-06,
"loss": 0.498,
"step": 198
},
{
"epoch": 0.21436265709156194,
"grad_norm": 0.41103053092956543,
"learning_rate": 7.1326164874551975e-06,
"loss": 0.4768,
"step": 199
},
{
"epoch": 0.21543985637342908,
"grad_norm": 0.4058842062950134,
"learning_rate": 7.168458781362008e-06,
"loss": 0.4918,
"step": 200
},
{
"epoch": 0.21651705565529622,
"grad_norm": 0.3815259039402008,
"learning_rate": 7.204301075268818e-06,
"loss": 0.5033,
"step": 201
},
{
"epoch": 0.21759425493716336,
"grad_norm": 0.4531411826610565,
"learning_rate": 7.240143369175628e-06,
"loss": 0.4867,
"step": 202
},
{
"epoch": 0.21867145421903053,
"grad_norm": 0.3983916640281677,
"learning_rate": 7.275985663082438e-06,
"loss": 0.467,
"step": 203
},
{
"epoch": 0.21974865350089767,
"grad_norm": 0.4266183078289032,
"learning_rate": 7.311827956989248e-06,
"loss": 0.4977,
"step": 204
},
{
"epoch": 0.22082585278276481,
"grad_norm": 0.4078884422779083,
"learning_rate": 7.347670250896059e-06,
"loss": 0.4827,
"step": 205
},
{
"epoch": 0.22190305206463196,
"grad_norm": 0.403360515832901,
"learning_rate": 7.383512544802868e-06,
"loss": 0.4696,
"step": 206
},
{
"epoch": 0.2229802513464991,
"grad_norm": 0.46340030431747437,
"learning_rate": 7.4193548387096784e-06,
"loss": 0.503,
"step": 207
},
{
"epoch": 0.22405745062836624,
"grad_norm": 0.431130975484848,
"learning_rate": 7.455197132616489e-06,
"loss": 0.524,
"step": 208
},
{
"epoch": 0.22513464991023338,
"grad_norm": 0.42561987042427063,
"learning_rate": 7.491039426523297e-06,
"loss": 0.5082,
"step": 209
},
{
"epoch": 0.22621184919210055,
"grad_norm": 0.42905059456825256,
"learning_rate": 7.526881720430108e-06,
"loss": 0.4712,
"step": 210
},
{
"epoch": 0.2272890484739677,
"grad_norm": 0.4395630359649658,
"learning_rate": 7.562724014336919e-06,
"loss": 0.4763,
"step": 211
},
{
"epoch": 0.22836624775583483,
"grad_norm": 0.38788971304893494,
"learning_rate": 7.5985663082437275e-06,
"loss": 0.4783,
"step": 212
},
{
"epoch": 0.22944344703770198,
"grad_norm": 0.4753347635269165,
"learning_rate": 7.634408602150538e-06,
"loss": 0.4787,
"step": 213
},
{
"epoch": 0.23052064631956912,
"grad_norm": 0.45695263147354126,
"learning_rate": 7.670250896057349e-06,
"loss": 0.494,
"step": 214
},
{
"epoch": 0.23159784560143626,
"grad_norm": 0.43443480134010315,
"learning_rate": 7.706093189964159e-06,
"loss": 0.4722,
"step": 215
},
{
"epoch": 0.2326750448833034,
"grad_norm": 0.4508543610572815,
"learning_rate": 7.741935483870968e-06,
"loss": 0.4808,
"step": 216
},
{
"epoch": 0.23375224416517057,
"grad_norm": 0.4751185178756714,
"learning_rate": 7.77777777777778e-06,
"loss": 0.4708,
"step": 217
},
{
"epoch": 0.2348294434470377,
"grad_norm": 0.3885140120983124,
"learning_rate": 7.813620071684589e-06,
"loss": 0.481,
"step": 218
},
{
"epoch": 0.23590664272890485,
"grad_norm": 0.4592781662940979,
"learning_rate": 7.849462365591398e-06,
"loss": 0.4857,
"step": 219
},
{
"epoch": 0.236983842010772,
"grad_norm": 0.49362388253211975,
"learning_rate": 7.88530465949821e-06,
"loss": 0.4959,
"step": 220
},
{
"epoch": 0.23806104129263914,
"grad_norm": 0.41868501901626587,
"learning_rate": 7.921146953405019e-06,
"loss": 0.499,
"step": 221
},
{
"epoch": 0.23913824057450628,
"grad_norm": 0.4312754273414612,
"learning_rate": 7.956989247311828e-06,
"loss": 0.4841,
"step": 222
},
{
"epoch": 0.24021543985637342,
"grad_norm": 0.4530230164527893,
"learning_rate": 7.992831541218638e-06,
"loss": 0.5207,
"step": 223
},
{
"epoch": 0.24129263913824056,
"grad_norm": 0.38147202134132385,
"learning_rate": 8.028673835125449e-06,
"loss": 0.4681,
"step": 224
},
{
"epoch": 0.24236983842010773,
"grad_norm": 0.4324093759059906,
"learning_rate": 8.064516129032258e-06,
"loss": 0.4878,
"step": 225
},
{
"epoch": 0.24344703770197487,
"grad_norm": 0.46749916672706604,
"learning_rate": 8.100358422939068e-06,
"loss": 0.4845,
"step": 226
},
{
"epoch": 0.244524236983842,
"grad_norm": 0.4217725694179535,
"learning_rate": 8.136200716845879e-06,
"loss": 0.4637,
"step": 227
},
{
"epoch": 0.24560143626570916,
"grad_norm": 0.4384757876396179,
"learning_rate": 8.172043010752689e-06,
"loss": 0.4792,
"step": 228
},
{
"epoch": 0.2466786355475763,
"grad_norm": 0.44224825501441956,
"learning_rate": 8.207885304659498e-06,
"loss": 0.4829,
"step": 229
},
{
"epoch": 0.24775583482944344,
"grad_norm": 0.4307554066181183,
"learning_rate": 8.24372759856631e-06,
"loss": 0.4907,
"step": 230
},
{
"epoch": 0.24883303411131058,
"grad_norm": 0.42513298988342285,
"learning_rate": 8.279569892473119e-06,
"loss": 0.4889,
"step": 231
},
{
"epoch": 0.24991023339317775,
"grad_norm": 0.4699139893054962,
"learning_rate": 8.315412186379928e-06,
"loss": 0.4831,
"step": 232
},
{
"epoch": 0.25098743267504486,
"grad_norm": 0.4631340205669403,
"learning_rate": 8.35125448028674e-06,
"loss": 0.4589,
"step": 233
},
{
"epoch": 0.25206463195691203,
"grad_norm": 0.4672948122024536,
"learning_rate": 8.387096774193549e-06,
"loss": 0.4882,
"step": 234
},
{
"epoch": 0.25314183123877915,
"grad_norm": 0.4274214804172516,
"learning_rate": 8.422939068100358e-06,
"loss": 0.4877,
"step": 235
},
{
"epoch": 0.2542190305206463,
"grad_norm": 0.5479405522346497,
"learning_rate": 8.45878136200717e-06,
"loss": 0.4791,
"step": 236
},
{
"epoch": 0.2552962298025135,
"grad_norm": 0.41502436995506287,
"learning_rate": 8.494623655913979e-06,
"loss": 0.4607,
"step": 237
},
{
"epoch": 0.2563734290843806,
"grad_norm": 0.44688135385513306,
"learning_rate": 8.530465949820788e-06,
"loss": 0.4277,
"step": 238
},
{
"epoch": 0.25745062836624777,
"grad_norm": 0.48682472109794617,
"learning_rate": 8.5663082437276e-06,
"loss": 0.4692,
"step": 239
},
{
"epoch": 0.2585278276481149,
"grad_norm": 0.5049644708633423,
"learning_rate": 8.602150537634409e-06,
"loss": 0.4812,
"step": 240
},
{
"epoch": 0.25960502692998205,
"grad_norm": 0.44088059663772583,
"learning_rate": 8.63799283154122e-06,
"loss": 0.4895,
"step": 241
},
{
"epoch": 0.26068222621184917,
"grad_norm": 0.5026620030403137,
"learning_rate": 8.67383512544803e-06,
"loss": 0.4683,
"step": 242
},
{
"epoch": 0.26175942549371634,
"grad_norm": 0.41200295090675354,
"learning_rate": 8.70967741935484e-06,
"loss": 0.4672,
"step": 243
},
{
"epoch": 0.2628366247755835,
"grad_norm": 0.4825300872325897,
"learning_rate": 8.74551971326165e-06,
"loss": 0.4735,
"step": 244
},
{
"epoch": 0.2639138240574506,
"grad_norm": 0.45688295364379883,
"learning_rate": 8.78136200716846e-06,
"loss": 0.4809,
"step": 245
},
{
"epoch": 0.2649910233393178,
"grad_norm": 0.4559200704097748,
"learning_rate": 8.81720430107527e-06,
"loss": 0.4656,
"step": 246
},
{
"epoch": 0.2660682226211849,
"grad_norm": 0.49576395750045776,
"learning_rate": 8.85304659498208e-06,
"loss": 0.4585,
"step": 247
},
{
"epoch": 0.26714542190305207,
"grad_norm": 0.5066803693771362,
"learning_rate": 8.888888888888888e-06,
"loss": 0.4776,
"step": 248
},
{
"epoch": 0.2682226211849192,
"grad_norm": 0.45653069019317627,
"learning_rate": 8.9247311827957e-06,
"loss": 0.4718,
"step": 249
},
{
"epoch": 0.26929982046678635,
"grad_norm": 0.5092087984085083,
"learning_rate": 8.96057347670251e-06,
"loss": 0.4792,
"step": 250
},
{
"epoch": 0.2703770197486535,
"grad_norm": 0.45919767022132874,
"learning_rate": 8.99641577060932e-06,
"loss": 0.4899,
"step": 251
},
{
"epoch": 0.27145421903052064,
"grad_norm": 0.4007905125617981,
"learning_rate": 9.03225806451613e-06,
"loss": 0.4885,
"step": 252
},
{
"epoch": 0.2725314183123878,
"grad_norm": 0.43509814143180847,
"learning_rate": 9.068100358422939e-06,
"loss": 0.488,
"step": 253
},
{
"epoch": 0.2736086175942549,
"grad_norm": 0.4242767095565796,
"learning_rate": 9.10394265232975e-06,
"loss": 0.4712,
"step": 254
},
{
"epoch": 0.2746858168761221,
"grad_norm": 0.41952240467071533,
"learning_rate": 9.13978494623656e-06,
"loss": 0.4879,
"step": 255
},
{
"epoch": 0.2757630161579892,
"grad_norm": 0.4186972677707672,
"learning_rate": 9.17562724014337e-06,
"loss": 0.4629,
"step": 256
},
{
"epoch": 0.2768402154398564,
"grad_norm": 0.49089375138282776,
"learning_rate": 9.21146953405018e-06,
"loss": 0.4804,
"step": 257
},
{
"epoch": 0.27791741472172354,
"grad_norm": 0.43804821372032166,
"learning_rate": 9.24731182795699e-06,
"loss": 0.4691,
"step": 258
},
{
"epoch": 0.27899461400359066,
"grad_norm": 0.4435538053512573,
"learning_rate": 9.2831541218638e-06,
"loss": 0.459,
"step": 259
},
{
"epoch": 0.2800718132854578,
"grad_norm": 0.47574469447135925,
"learning_rate": 9.31899641577061e-06,
"loss": 0.4785,
"step": 260
},
{
"epoch": 0.28114901256732494,
"grad_norm": 0.41517260670661926,
"learning_rate": 9.35483870967742e-06,
"loss": 0.5019,
"step": 261
},
{
"epoch": 0.2822262118491921,
"grad_norm": 0.4552224278450012,
"learning_rate": 9.39068100358423e-06,
"loss": 0.4993,
"step": 262
},
{
"epoch": 0.2833034111310592,
"grad_norm": 0.4515886902809143,
"learning_rate": 9.42652329749104e-06,
"loss": 0.4738,
"step": 263
},
{
"epoch": 0.2843806104129264,
"grad_norm": 0.47657546401023865,
"learning_rate": 9.46236559139785e-06,
"loss": 0.4634,
"step": 264
},
{
"epoch": 0.28545780969479356,
"grad_norm": 0.45850643515586853,
"learning_rate": 9.49820788530466e-06,
"loss": 0.4745,
"step": 265
},
{
"epoch": 0.2865350089766607,
"grad_norm": 0.469215989112854,
"learning_rate": 9.53405017921147e-06,
"loss": 0.4602,
"step": 266
},
{
"epoch": 0.28761220825852785,
"grad_norm": 0.4392701983451843,
"learning_rate": 9.56989247311828e-06,
"loss": 0.4716,
"step": 267
},
{
"epoch": 0.28868940754039496,
"grad_norm": 0.40122461318969727,
"learning_rate": 9.60573476702509e-06,
"loss": 0.4639,
"step": 268
},
{
"epoch": 0.28976660682226213,
"grad_norm": 0.4862971603870392,
"learning_rate": 9.641577060931901e-06,
"loss": 0.4915,
"step": 269
},
{
"epoch": 0.29084380610412924,
"grad_norm": 0.4537064731121063,
"learning_rate": 9.67741935483871e-06,
"loss": 0.4882,
"step": 270
},
{
"epoch": 0.2919210053859964,
"grad_norm": 0.478216290473938,
"learning_rate": 9.71326164874552e-06,
"loss": 0.4644,
"step": 271
},
{
"epoch": 0.2929982046678636,
"grad_norm": 0.5602977871894836,
"learning_rate": 9.749103942652331e-06,
"loss": 0.4738,
"step": 272
},
{
"epoch": 0.2940754039497307,
"grad_norm": 0.5121861100196838,
"learning_rate": 9.78494623655914e-06,
"loss": 0.4917,
"step": 273
},
{
"epoch": 0.29515260323159787,
"grad_norm": 0.5532698631286621,
"learning_rate": 9.820788530465952e-06,
"loss": 0.4485,
"step": 274
},
{
"epoch": 0.296229802513465,
"grad_norm": 0.49017244577407837,
"learning_rate": 9.856630824372761e-06,
"loss": 0.4491,
"step": 275
},
{
"epoch": 0.29730700179533215,
"grad_norm": 0.5022942423820496,
"learning_rate": 9.89247311827957e-06,
"loss": 0.4817,
"step": 276
},
{
"epoch": 0.29838420107719926,
"grad_norm": 0.4923330545425415,
"learning_rate": 9.928315412186382e-06,
"loss": 0.4763,
"step": 277
},
{
"epoch": 0.29946140035906643,
"grad_norm": 0.46441808342933655,
"learning_rate": 9.96415770609319e-06,
"loss": 0.481,
"step": 278
},
{
"epoch": 0.30053859964093355,
"grad_norm": 0.49813228845596313,
"learning_rate": 1e-05,
"loss": 0.4779,
"step": 279
},
{
"epoch": 0.3016157989228007,
"grad_norm": 0.533608078956604,
"learning_rate": 9.999996067902875e-06,
"loss": 0.4802,
"step": 280
},
{
"epoch": 0.3026929982046679,
"grad_norm": 0.5138203501701355,
"learning_rate": 9.999984271617681e-06,
"loss": 0.4656,
"step": 281
},
{
"epoch": 0.303770197486535,
"grad_norm": 0.45372065901756287,
"learning_rate": 9.999964611162973e-06,
"loss": 0.4577,
"step": 282
},
{
"epoch": 0.30484739676840217,
"grad_norm": 0.5210473537445068,
"learning_rate": 9.999937086569674e-06,
"loss": 0.4922,
"step": 283
},
{
"epoch": 0.3059245960502693,
"grad_norm": 0.49455979466438293,
"learning_rate": 9.999901697881075e-06,
"loss": 0.4849,
"step": 284
},
{
"epoch": 0.30700179533213645,
"grad_norm": 0.4365752935409546,
"learning_rate": 9.999858445152838e-06,
"loss": 0.4809,
"step": 285
},
{
"epoch": 0.30807899461400357,
"grad_norm": 0.559883177280426,
"learning_rate": 9.999807328452991e-06,
"loss": 0.4971,
"step": 286
},
{
"epoch": 0.30915619389587073,
"grad_norm": 0.45351046323776245,
"learning_rate": 9.999748347861935e-06,
"loss": 0.4844,
"step": 287
},
{
"epoch": 0.3102333931777379,
"grad_norm": 0.5587484240531921,
"learning_rate": 9.999681503472433e-06,
"loss": 0.4572,
"step": 288
},
{
"epoch": 0.311310592459605,
"grad_norm": 0.46588122844696045,
"learning_rate": 9.999606795389623e-06,
"loss": 0.4648,
"step": 289
},
{
"epoch": 0.3123877917414722,
"grad_norm": 0.4443756639957428,
"learning_rate": 9.999524223731009e-06,
"loss": 0.4749,
"step": 290
},
{
"epoch": 0.3134649910233393,
"grad_norm": 0.48129507899284363,
"learning_rate": 9.999433788626461e-06,
"loss": 0.4772,
"step": 291
},
{
"epoch": 0.31454219030520647,
"grad_norm": 0.42726975679397583,
"learning_rate": 9.999335490218221e-06,
"loss": 0.4763,
"step": 292
},
{
"epoch": 0.3156193895870736,
"grad_norm": 0.4765612781047821,
"learning_rate": 9.999229328660896e-06,
"loss": 0.4881,
"step": 293
},
{
"epoch": 0.31669658886894075,
"grad_norm": 0.40721437335014343,
"learning_rate": 9.999115304121459e-06,
"loss": 0.4688,
"step": 294
},
{
"epoch": 0.3177737881508079,
"grad_norm": 0.4598512649536133,
"learning_rate": 9.998993416779254e-06,
"loss": 0.4472,
"step": 295
},
{
"epoch": 0.31885098743267504,
"grad_norm": 0.46925970911979675,
"learning_rate": 9.99886366682599e-06,
"loss": 0.4569,
"step": 296
},
{
"epoch": 0.3199281867145422,
"grad_norm": 0.4148475229740143,
"learning_rate": 9.998726054465746e-06,
"loss": 0.4615,
"step": 297
},
{
"epoch": 0.3210053859964093,
"grad_norm": 0.4196060299873352,
"learning_rate": 9.998580579914958e-06,
"loss": 0.4578,
"step": 298
},
{
"epoch": 0.3220825852782765,
"grad_norm": 0.4774996042251587,
"learning_rate": 9.998427243402437e-06,
"loss": 0.4582,
"step": 299
},
{
"epoch": 0.3231597845601436,
"grad_norm": 0.47536271810531616,
"learning_rate": 9.998266045169356e-06,
"loss": 0.4774,
"step": 300
},
{
"epoch": 0.3242369838420108,
"grad_norm": 0.46730348467826843,
"learning_rate": 9.998096985469255e-06,
"loss": 0.4848,
"step": 301
},
{
"epoch": 0.32531418312387794,
"grad_norm": 0.46201860904693604,
"learning_rate": 9.997920064568037e-06,
"loss": 0.471,
"step": 302
},
{
"epoch": 0.32639138240574506,
"grad_norm": 0.46686047315597534,
"learning_rate": 9.99773528274397e-06,
"loss": 0.4181,
"step": 303
},
{
"epoch": 0.3274685816876122,
"grad_norm": 0.41051632165908813,
"learning_rate": 9.997542640287686e-06,
"loss": 0.4339,
"step": 304
},
{
"epoch": 0.32854578096947934,
"grad_norm": 0.49075937271118164,
"learning_rate": 9.997342137502182e-06,
"loss": 0.4619,
"step": 305
},
{
"epoch": 0.3296229802513465,
"grad_norm": 0.43930885195732117,
"learning_rate": 9.997133774702813e-06,
"loss": 0.4411,
"step": 306
},
{
"epoch": 0.3307001795332136,
"grad_norm": 0.46991202235221863,
"learning_rate": 9.996917552217303e-06,
"loss": 0.4657,
"step": 307
},
{
"epoch": 0.3317773788150808,
"grad_norm": 0.3972565829753876,
"learning_rate": 9.996693470385735e-06,
"loss": 0.4495,
"step": 308
},
{
"epoch": 0.33285457809694796,
"grad_norm": 0.4870466887950897,
"learning_rate": 9.996461529560553e-06,
"loss": 0.482,
"step": 309
},
{
"epoch": 0.3339317773788151,
"grad_norm": 0.4203915297985077,
"learning_rate": 9.996221730106561e-06,
"loss": 0.4539,
"step": 310
},
{
"epoch": 0.33500897666068225,
"grad_norm": 0.3791579604148865,
"learning_rate": 9.995974072400928e-06,
"loss": 0.4676,
"step": 311
},
{
"epoch": 0.33608617594254936,
"grad_norm": 0.4781574308872223,
"learning_rate": 9.995718556833179e-06,
"loss": 0.4452,
"step": 312
},
{
"epoch": 0.33716337522441653,
"grad_norm": 0.4126666486263275,
"learning_rate": 9.995455183805195e-06,
"loss": 0.4653,
"step": 313
},
{
"epoch": 0.33824057450628364,
"grad_norm": 0.4394056797027588,
"learning_rate": 9.995183953731225e-06,
"loss": 0.4594,
"step": 314
},
{
"epoch": 0.3393177737881508,
"grad_norm": 0.4282722473144531,
"learning_rate": 9.994904867037867e-06,
"loss": 0.4553,
"step": 315
},
{
"epoch": 0.340394973070018,
"grad_norm": 0.41215622425079346,
"learning_rate": 9.99461792416408e-06,
"loss": 0.4687,
"step": 316
},
{
"epoch": 0.3414721723518851,
"grad_norm": 0.44896167516708374,
"learning_rate": 9.994323125561179e-06,
"loss": 0.4601,
"step": 317
},
{
"epoch": 0.34254937163375226,
"grad_norm": 0.40661004185676575,
"learning_rate": 9.994020471692832e-06,
"loss": 0.4469,
"step": 318
},
{
"epoch": 0.3436265709156194,
"grad_norm": 0.48509952425956726,
"learning_rate": 9.99370996303507e-06,
"loss": 0.4431,
"step": 319
},
{
"epoch": 0.34470377019748655,
"grad_norm": 0.46913495659828186,
"learning_rate": 9.993391600076268e-06,
"loss": 0.4676,
"step": 320
},
{
"epoch": 0.34578096947935366,
"grad_norm": 0.4532509446144104,
"learning_rate": 9.993065383317164e-06,
"loss": 0.4637,
"step": 321
},
{
"epoch": 0.34685816876122083,
"grad_norm": 0.4724155068397522,
"learning_rate": 9.992731313270841e-06,
"loss": 0.4771,
"step": 322
},
{
"epoch": 0.34793536804308794,
"grad_norm": 0.42328765988349915,
"learning_rate": 9.99238939046274e-06,
"loss": 0.45,
"step": 323
},
{
"epoch": 0.3490125673249551,
"grad_norm": 0.5050341486930847,
"learning_rate": 9.992039615430648e-06,
"loss": 0.4599,
"step": 324
},
{
"epoch": 0.3500897666068223,
"grad_norm": 0.4233899712562561,
"learning_rate": 9.991681988724706e-06,
"loss": 0.4778,
"step": 325
},
{
"epoch": 0.3511669658886894,
"grad_norm": 0.5298818945884705,
"learning_rate": 9.991316510907403e-06,
"loss": 0.4604,
"step": 326
},
{
"epoch": 0.35224416517055657,
"grad_norm": 0.43540188670158386,
"learning_rate": 9.990943182553578e-06,
"loss": 0.4701,
"step": 327
},
{
"epoch": 0.3533213644524237,
"grad_norm": 0.47227743268013,
"learning_rate": 9.990562004250415e-06,
"loss": 0.4545,
"step": 328
},
{
"epoch": 0.35439856373429085,
"grad_norm": 0.5041068196296692,
"learning_rate": 9.990172976597446e-06,
"loss": 0.493,
"step": 329
},
{
"epoch": 0.35547576301615796,
"grad_norm": 0.43627163767814636,
"learning_rate": 9.989776100206547e-06,
"loss": 0.4389,
"step": 330
},
{
"epoch": 0.35655296229802513,
"grad_norm": 0.4468795359134674,
"learning_rate": 9.989371375701943e-06,
"loss": 0.468,
"step": 331
},
{
"epoch": 0.3576301615798923,
"grad_norm": 0.4504821300506592,
"learning_rate": 9.988958803720203e-06,
"loss": 0.4567,
"step": 332
},
{
"epoch": 0.3587073608617594,
"grad_norm": 0.4463129937648773,
"learning_rate": 9.988538384910231e-06,
"loss": 0.4592,
"step": 333
},
{
"epoch": 0.3597845601436266,
"grad_norm": 0.376647025346756,
"learning_rate": 9.988110119933281e-06,
"loss": 0.444,
"step": 334
},
{
"epoch": 0.3608617594254937,
"grad_norm": 0.49062368273735046,
"learning_rate": 9.987674009462943e-06,
"loss": 0.48,
"step": 335
},
{
"epoch": 0.36193895870736087,
"grad_norm": 0.4553554058074951,
"learning_rate": 9.98723005418515e-06,
"loss": 0.4651,
"step": 336
},
{
"epoch": 0.363016157989228,
"grad_norm": 0.559506893157959,
"learning_rate": 9.986778254798173e-06,
"loss": 0.4609,
"step": 337
},
{
"epoch": 0.36409335727109515,
"grad_norm": 0.43453335762023926,
"learning_rate": 9.986318612012618e-06,
"loss": 0.461,
"step": 338
},
{
"epoch": 0.3651705565529623,
"grad_norm": 0.44681429862976074,
"learning_rate": 9.985851126551428e-06,
"loss": 0.471,
"step": 339
},
{
"epoch": 0.36624775583482944,
"grad_norm": 0.46860507130622864,
"learning_rate": 9.985375799149883e-06,
"loss": 0.4694,
"step": 340
},
{
"epoch": 0.3673249551166966,
"grad_norm": 0.48733362555503845,
"learning_rate": 9.9848926305556e-06,
"loss": 0.4881,
"step": 341
},
{
"epoch": 0.3684021543985637,
"grad_norm": 0.4342585802078247,
"learning_rate": 9.984401621528521e-06,
"loss": 0.4841,
"step": 342
},
{
"epoch": 0.3694793536804309,
"grad_norm": 0.46184322237968445,
"learning_rate": 9.983902772840925e-06,
"loss": 0.4956,
"step": 343
},
{
"epoch": 0.370556552962298,
"grad_norm": 0.41323983669281006,
"learning_rate": 9.983396085277421e-06,
"loss": 0.4476,
"step": 344
},
{
"epoch": 0.37163375224416517,
"grad_norm": 0.423006147146225,
"learning_rate": 9.982881559634946e-06,
"loss": 0.4556,
"step": 345
},
{
"epoch": 0.37271095152603234,
"grad_norm": 0.45063871145248413,
"learning_rate": 9.982359196722769e-06,
"loss": 0.4662,
"step": 346
},
{
"epoch": 0.37378815080789946,
"grad_norm": 0.4173057973384857,
"learning_rate": 9.981828997362481e-06,
"loss": 0.4592,
"step": 347
},
{
"epoch": 0.3748653500897666,
"grad_norm": 0.4817594587802887,
"learning_rate": 9.981290962387998e-06,
"loss": 0.4571,
"step": 348
},
{
"epoch": 0.37594254937163374,
"grad_norm": 0.4868641197681427,
"learning_rate": 9.980745092645564e-06,
"loss": 0.4797,
"step": 349
},
{
"epoch": 0.3770197486535009,
"grad_norm": 0.4591589868068695,
"learning_rate": 9.980191388993745e-06,
"loss": 0.466,
"step": 350
},
{
"epoch": 0.378096947935368,
"grad_norm": 0.4101882576942444,
"learning_rate": 9.979629852303426e-06,
"loss": 0.47,
"step": 351
},
{
"epoch": 0.3791741472172352,
"grad_norm": 0.5075291991233826,
"learning_rate": 9.979060483457813e-06,
"loss": 0.4507,
"step": 352
},
{
"epoch": 0.38025134649910236,
"grad_norm": 0.4138559401035309,
"learning_rate": 9.978483283352438e-06,
"loss": 0.4771,
"step": 353
},
{
"epoch": 0.3813285457809695,
"grad_norm": 0.5149195790290833,
"learning_rate": 9.977898252895133e-06,
"loss": 0.4691,
"step": 354
},
{
"epoch": 0.38240574506283664,
"grad_norm": 0.43113377690315247,
"learning_rate": 9.977305393006066e-06,
"loss": 0.4717,
"step": 355
},
{
"epoch": 0.38348294434470376,
"grad_norm": 0.48590296506881714,
"learning_rate": 9.976704704617706e-06,
"loss": 0.4621,
"step": 356
},
{
"epoch": 0.3845601436265709,
"grad_norm": 0.4883146286010742,
"learning_rate": 9.976096188674837e-06,
"loss": 0.4647,
"step": 357
},
{
"epoch": 0.38563734290843804,
"grad_norm": 0.5317089557647705,
"learning_rate": 9.975479846134561e-06,
"loss": 0.4651,
"step": 358
},
{
"epoch": 0.3867145421903052,
"grad_norm": 0.5008561015129089,
"learning_rate": 9.974855677966283e-06,
"loss": 0.4538,
"step": 359
},
{
"epoch": 0.3877917414721723,
"grad_norm": 0.48376259207725525,
"learning_rate": 9.97422368515172e-06,
"loss": 0.4654,
"step": 360
},
{
"epoch": 0.3888689407540395,
"grad_norm": 0.4275338053703308,
"learning_rate": 9.973583868684892e-06,
"loss": 0.4519,
"step": 361
},
{
"epoch": 0.38994614003590666,
"grad_norm": 0.4228547215461731,
"learning_rate": 9.972936229572132e-06,
"loss": 0.4502,
"step": 362
},
{
"epoch": 0.3910233393177738,
"grad_norm": 0.4408591687679291,
"learning_rate": 9.972280768832068e-06,
"loss": 0.4262,
"step": 363
},
{
"epoch": 0.39210053859964095,
"grad_norm": 0.4212534427642822,
"learning_rate": 9.971617487495635e-06,
"loss": 0.4564,
"step": 364
},
{
"epoch": 0.39317773788150806,
"grad_norm": 0.5158016681671143,
"learning_rate": 9.97094638660607e-06,
"loss": 0.4892,
"step": 365
},
{
"epoch": 0.39425493716337523,
"grad_norm": 0.4569847285747528,
"learning_rate": 9.970267467218905e-06,
"loss": 0.459,
"step": 366
},
{
"epoch": 0.39533213644524234,
"grad_norm": 0.44112586975097656,
"learning_rate": 9.969580730401966e-06,
"loss": 0.4739,
"step": 367
},
{
"epoch": 0.3964093357271095,
"grad_norm": 0.5548210740089417,
"learning_rate": 9.968886177235388e-06,
"loss": 0.4782,
"step": 368
},
{
"epoch": 0.3974865350089767,
"grad_norm": 0.4073880910873413,
"learning_rate": 9.968183808811586e-06,
"loss": 0.4584,
"step": 369
},
{
"epoch": 0.3985637342908438,
"grad_norm": 0.4366508722305298,
"learning_rate": 9.967473626235273e-06,
"loss": 0.4633,
"step": 370
},
{
"epoch": 0.39964093357271097,
"grad_norm": 0.4878612160682678,
"learning_rate": 9.966755630623452e-06,
"loss": 0.4558,
"step": 371
},
{
"epoch": 0.4007181328545781,
"grad_norm": 0.4154806435108185,
"learning_rate": 9.966029823105415e-06,
"loss": 0.4659,
"step": 372
},
{
"epoch": 0.40179533213644525,
"grad_norm": 0.512233555316925,
"learning_rate": 9.965296204822741e-06,
"loss": 0.4756,
"step": 373
},
{
"epoch": 0.40287253141831236,
"grad_norm": 0.4381474554538727,
"learning_rate": 9.964554776929289e-06,
"loss": 0.4753,
"step": 374
},
{
"epoch": 0.40394973070017953,
"grad_norm": 0.5151176452636719,
"learning_rate": 9.96380554059121e-06,
"loss": 0.4581,
"step": 375
},
{
"epoch": 0.4050269299820467,
"grad_norm": 0.5161194205284119,
"learning_rate": 9.963048496986933e-06,
"loss": 0.4902,
"step": 376
},
{
"epoch": 0.4061041292639138,
"grad_norm": 0.44209739565849304,
"learning_rate": 9.962283647307162e-06,
"loss": 0.4777,
"step": 377
},
{
"epoch": 0.407181328545781,
"grad_norm": 0.4685749411582947,
"learning_rate": 9.961510992754883e-06,
"loss": 0.4763,
"step": 378
},
{
"epoch": 0.4082585278276481,
"grad_norm": 0.43372246623039246,
"learning_rate": 9.960730534545357e-06,
"loss": 0.4455,
"step": 379
},
{
"epoch": 0.40933572710951527,
"grad_norm": 0.4805338382720947,
"learning_rate": 9.95994227390612e-06,
"loss": 0.4361,
"step": 380
},
{
"epoch": 0.4104129263913824,
"grad_norm": 0.3987146317958832,
"learning_rate": 9.95914621207698e-06,
"loss": 0.4497,
"step": 381
},
{
"epoch": 0.41149012567324955,
"grad_norm": 0.43490350246429443,
"learning_rate": 9.958342350310014e-06,
"loss": 0.4593,
"step": 382
},
{
"epoch": 0.4125673249551167,
"grad_norm": 0.3805745542049408,
"learning_rate": 9.957530689869561e-06,
"loss": 0.4567,
"step": 383
},
{
"epoch": 0.41364452423698383,
"grad_norm": 0.42765673995018005,
"learning_rate": 9.95671123203224e-06,
"loss": 0.4404,
"step": 384
},
{
"epoch": 0.414721723518851,
"grad_norm": 0.41294559836387634,
"learning_rate": 9.955883978086922e-06,
"loss": 0.458,
"step": 385
},
{
"epoch": 0.4157989228007181,
"grad_norm": 0.4390866756439209,
"learning_rate": 9.955048929334744e-06,
"loss": 0.4655,
"step": 386
},
{
"epoch": 0.4168761220825853,
"grad_norm": 0.4394264817237854,
"learning_rate": 9.954206087089107e-06,
"loss": 0.4548,
"step": 387
},
{
"epoch": 0.4179533213644524,
"grad_norm": 0.39660122990608215,
"learning_rate": 9.953355452675661e-06,
"loss": 0.4537,
"step": 388
},
{
"epoch": 0.41903052064631957,
"grad_norm": 0.4672384262084961,
"learning_rate": 9.95249702743232e-06,
"loss": 0.4859,
"step": 389
},
{
"epoch": 0.42010771992818674,
"grad_norm": 0.45546823740005493,
"learning_rate": 9.951630812709245e-06,
"loss": 0.4561,
"step": 390
},
{
"epoch": 0.42118491921005385,
"grad_norm": 0.5013604760169983,
"learning_rate": 9.950756809868858e-06,
"loss": 0.4402,
"step": 391
},
{
"epoch": 0.422262118491921,
"grad_norm": 0.43657436966896057,
"learning_rate": 9.94987502028582e-06,
"loss": 0.4837,
"step": 392
},
{
"epoch": 0.42333931777378814,
"grad_norm": 0.5739855766296387,
"learning_rate": 9.948985445347045e-06,
"loss": 0.4878,
"step": 393
},
{
"epoch": 0.4244165170556553,
"grad_norm": 0.4679727554321289,
"learning_rate": 9.948088086451692e-06,
"loss": 0.4489,
"step": 394
},
{
"epoch": 0.4254937163375224,
"grad_norm": 0.5196583271026611,
"learning_rate": 9.94718294501116e-06,
"loss": 0.4685,
"step": 395
},
{
"epoch": 0.4265709156193896,
"grad_norm": 0.5366904735565186,
"learning_rate": 9.946270022449093e-06,
"loss": 0.4555,
"step": 396
},
{
"epoch": 0.42764811490125676,
"grad_norm": 0.40296608209609985,
"learning_rate": 9.94534932020137e-06,
"loss": 0.4488,
"step": 397
},
{
"epoch": 0.4287253141831239,
"grad_norm": 0.46213823556900024,
"learning_rate": 9.944420839716106e-06,
"loss": 0.4534,
"step": 398
},
{
"epoch": 0.42980251346499104,
"grad_norm": 0.5131990313529968,
"learning_rate": 9.943484582453653e-06,
"loss": 0.4459,
"step": 399
},
{
"epoch": 0.43087971274685816,
"grad_norm": 0.4590674340724945,
"learning_rate": 9.942540549886592e-06,
"loss": 0.4607,
"step": 400
},
{
"epoch": 0.4319569120287253,
"grad_norm": 0.4340755045413971,
"learning_rate": 9.941588743499734e-06,
"loss": 0.4537,
"step": 401
},
{
"epoch": 0.43303411131059244,
"grad_norm": 0.4147309958934784,
"learning_rate": 9.94062916479012e-06,
"loss": 0.4692,
"step": 402
},
{
"epoch": 0.4341113105924596,
"grad_norm": 0.42131778597831726,
"learning_rate": 9.939661815267008e-06,
"loss": 0.4442,
"step": 403
},
{
"epoch": 0.4351885098743267,
"grad_norm": 0.4066350758075714,
"learning_rate": 9.938686696451884e-06,
"loss": 0.4673,
"step": 404
},
{
"epoch": 0.4362657091561939,
"grad_norm": 0.42381957173347473,
"learning_rate": 9.937703809878455e-06,
"loss": 0.4596,
"step": 405
},
{
"epoch": 0.43734290843806106,
"grad_norm": 0.4123949408531189,
"learning_rate": 9.936713157092641e-06,
"loss": 0.4555,
"step": 406
},
{
"epoch": 0.4384201077199282,
"grad_norm": 0.45048898458480835,
"learning_rate": 9.93571473965258e-06,
"loss": 0.4704,
"step": 407
},
{
"epoch": 0.43949730700179535,
"grad_norm": 0.35981449484825134,
"learning_rate": 9.934708559128624e-06,
"loss": 0.4316,
"step": 408
},
{
"epoch": 0.44057450628366246,
"grad_norm": 0.4269760847091675,
"learning_rate": 9.933694617103328e-06,
"loss": 0.4453,
"step": 409
},
{
"epoch": 0.44165170556552963,
"grad_norm": 0.43287765979766846,
"learning_rate": 9.932672915171461e-06,
"loss": 0.4604,
"step": 410
},
{
"epoch": 0.44272890484739674,
"grad_norm": 0.4244833290576935,
"learning_rate": 9.93164345494e-06,
"loss": 0.4732,
"step": 411
},
{
"epoch": 0.4438061041292639,
"grad_norm": 0.38087978959083557,
"learning_rate": 9.930606238028116e-06,
"loss": 0.4557,
"step": 412
},
{
"epoch": 0.4448833034111311,
"grad_norm": 0.4723820686340332,
"learning_rate": 9.929561266067183e-06,
"loss": 0.4623,
"step": 413
},
{
"epoch": 0.4459605026929982,
"grad_norm": 0.4074157476425171,
"learning_rate": 9.928508540700775e-06,
"loss": 0.4358,
"step": 414
},
{
"epoch": 0.44703770197486536,
"grad_norm": 0.48810282349586487,
"learning_rate": 9.92744806358466e-06,
"loss": 0.479,
"step": 415
},
{
"epoch": 0.4481149012567325,
"grad_norm": 0.382062166929245,
"learning_rate": 9.926379836386796e-06,
"loss": 0.4684,
"step": 416
},
{
"epoch": 0.44919210053859965,
"grad_norm": 0.40181615948677063,
"learning_rate": 9.925303860787335e-06,
"loss": 0.4328,
"step": 417
},
{
"epoch": 0.45026929982046676,
"grad_norm": 0.41660112142562866,
"learning_rate": 9.924220138478612e-06,
"loss": 0.4642,
"step": 418
},
{
"epoch": 0.45134649910233393,
"grad_norm": 0.40994152426719666,
"learning_rate": 9.923128671165145e-06,
"loss": 0.441,
"step": 419
},
{
"epoch": 0.4524236983842011,
"grad_norm": 0.46039193868637085,
"learning_rate": 9.92202946056364e-06,
"loss": 0.4558,
"step": 420
},
{
"epoch": 0.4535008976660682,
"grad_norm": 0.40079423785209656,
"learning_rate": 9.920922508402975e-06,
"loss": 0.4417,
"step": 421
},
{
"epoch": 0.4545780969479354,
"grad_norm": 0.4265320897102356,
"learning_rate": 9.91980781642421e-06,
"loss": 0.4279,
"step": 422
},
{
"epoch": 0.4556552962298025,
"grad_norm": 0.4361076354980469,
"learning_rate": 9.918685386380575e-06,
"loss": 0.4563,
"step": 423
},
{
"epoch": 0.45673249551166967,
"grad_norm": 0.39475542306900024,
"learning_rate": 9.917555220037469e-06,
"loss": 0.4403,
"step": 424
},
{
"epoch": 0.4578096947935368,
"grad_norm": 0.403612345457077,
"learning_rate": 9.916417319172466e-06,
"loss": 0.4393,
"step": 425
},
{
"epoch": 0.45888689407540395,
"grad_norm": 0.40121960639953613,
"learning_rate": 9.915271685575297e-06,
"loss": 0.4616,
"step": 426
},
{
"epoch": 0.4599640933572711,
"grad_norm": 0.4377219080924988,
"learning_rate": 9.91411832104786e-06,
"loss": 0.4575,
"step": 427
},
{
"epoch": 0.46104129263913823,
"grad_norm": 0.44596531987190247,
"learning_rate": 9.912957227404215e-06,
"loss": 0.454,
"step": 428
},
{
"epoch": 0.4621184919210054,
"grad_norm": 0.4428754150867462,
"learning_rate": 9.91178840647057e-06,
"loss": 0.4486,
"step": 429
},
{
"epoch": 0.4631956912028725,
"grad_norm": 0.4418964385986328,
"learning_rate": 9.910611860085293e-06,
"loss": 0.4591,
"step": 430
},
{
"epoch": 0.4642728904847397,
"grad_norm": 0.4350813329219818,
"learning_rate": 9.909427590098905e-06,
"loss": 0.4604,
"step": 431
},
{
"epoch": 0.4653500897666068,
"grad_norm": 0.40184399485588074,
"learning_rate": 9.90823559837407e-06,
"loss": 0.4336,
"step": 432
},
{
"epoch": 0.46642728904847397,
"grad_norm": 0.5028787851333618,
"learning_rate": 9.907035886785597e-06,
"loss": 0.472,
"step": 433
},
{
"epoch": 0.46750448833034114,
"grad_norm": 0.46519935131073,
"learning_rate": 9.905828457220442e-06,
"loss": 0.4588,
"step": 434
},
{
"epoch": 0.46858168761220825,
"grad_norm": 0.3957783579826355,
"learning_rate": 9.904613311577696e-06,
"loss": 0.4605,
"step": 435
},
{
"epoch": 0.4696588868940754,
"grad_norm": 0.5078506469726562,
"learning_rate": 9.903390451768587e-06,
"loss": 0.4516,
"step": 436
},
{
"epoch": 0.47073608617594254,
"grad_norm": 0.4710095524787903,
"learning_rate": 9.902159879716475e-06,
"loss": 0.4706,
"step": 437
},
{
"epoch": 0.4718132854578097,
"grad_norm": 0.40339967608451843,
"learning_rate": 9.900921597356856e-06,
"loss": 0.4658,
"step": 438
},
{
"epoch": 0.4728904847396768,
"grad_norm": 0.4403778314590454,
"learning_rate": 9.899675606637344e-06,
"loss": 0.4394,
"step": 439
},
{
"epoch": 0.473967684021544,
"grad_norm": 0.4328095316886902,
"learning_rate": 9.898421909517684e-06,
"loss": 0.4669,
"step": 440
},
{
"epoch": 0.47504488330341116,
"grad_norm": 0.44412991404533386,
"learning_rate": 9.897160507969737e-06,
"loss": 0.4553,
"step": 441
},
{
"epoch": 0.4761220825852783,
"grad_norm": 0.4219338893890381,
"learning_rate": 9.895891403977489e-06,
"loss": 0.465,
"step": 442
},
{
"epoch": 0.47719928186714544,
"grad_norm": 0.4596662223339081,
"learning_rate": 9.894614599537032e-06,
"loss": 0.4562,
"step": 443
},
{
"epoch": 0.47827648114901256,
"grad_norm": 0.4500572383403778,
"learning_rate": 9.893330096656576e-06,
"loss": 0.4402,
"step": 444
},
{
"epoch": 0.4793536804308797,
"grad_norm": 0.405307799577713,
"learning_rate": 9.892037897356432e-06,
"loss": 0.4668,
"step": 445
},
{
"epoch": 0.48043087971274684,
"grad_norm": 0.4302726686000824,
"learning_rate": 9.890738003669029e-06,
"loss": 0.4451,
"step": 446
},
{
"epoch": 0.481508078994614,
"grad_norm": 0.4489511549472809,
"learning_rate": 9.889430417638886e-06,
"loss": 0.4393,
"step": 447
},
{
"epoch": 0.4825852782764811,
"grad_norm": 0.5107079744338989,
"learning_rate": 9.888115141322625e-06,
"loss": 0.451,
"step": 448
},
{
"epoch": 0.4836624775583483,
"grad_norm": 0.38805079460144043,
"learning_rate": 9.886792176788964e-06,
"loss": 0.4468,
"step": 449
},
{
"epoch": 0.48473967684021546,
"grad_norm": 0.5426356196403503,
"learning_rate": 9.885461526118713e-06,
"loss": 0.4754,
"step": 450
},
{
"epoch": 0.4858168761220826,
"grad_norm": 0.40643173456192017,
"learning_rate": 9.884123191404772e-06,
"loss": 0.4564,
"step": 451
},
{
"epoch": 0.48689407540394974,
"grad_norm": 0.48633506894111633,
"learning_rate": 9.882777174752128e-06,
"loss": 0.4621,
"step": 452
},
{
"epoch": 0.48797127468581686,
"grad_norm": 0.470723032951355,
"learning_rate": 9.881423478277841e-06,
"loss": 0.4296,
"step": 453
},
{
"epoch": 0.489048473967684,
"grad_norm": 0.4104878008365631,
"learning_rate": 9.880062104111064e-06,
"loss": 0.44,
"step": 454
},
{
"epoch": 0.49012567324955114,
"grad_norm": 0.4439231753349304,
"learning_rate": 9.878693054393017e-06,
"loss": 0.4728,
"step": 455
},
{
"epoch": 0.4912028725314183,
"grad_norm": 0.44275912642478943,
"learning_rate": 9.877316331276995e-06,
"loss": 0.4614,
"step": 456
},
{
"epoch": 0.4922800718132855,
"grad_norm": 0.4094824194908142,
"learning_rate": 9.87593193692836e-06,
"loss": 0.4441,
"step": 457
},
{
"epoch": 0.4933572710951526,
"grad_norm": 0.3908982574939728,
"learning_rate": 9.874539873524545e-06,
"loss": 0.4362,
"step": 458
},
{
"epoch": 0.49443447037701976,
"grad_norm": 0.47311311960220337,
"learning_rate": 9.873140143255035e-06,
"loss": 0.4788,
"step": 459
},
{
"epoch": 0.4955116696588869,
"grad_norm": 0.4665136933326721,
"learning_rate": 9.871732748321388e-06,
"loss": 0.4477,
"step": 460
},
{
"epoch": 0.49658886894075405,
"grad_norm": 0.4822944700717926,
"learning_rate": 9.870317690937204e-06,
"loss": 0.4447,
"step": 461
},
{
"epoch": 0.49766606822262116,
"grad_norm": 0.4455501437187195,
"learning_rate": 9.86889497332814e-06,
"loss": 0.4439,
"step": 462
},
{
"epoch": 0.49874326750448833,
"grad_norm": 0.545047402381897,
"learning_rate": 9.867464597731906e-06,
"loss": 0.4414,
"step": 463
},
{
"epoch": 0.4998204667863555,
"grad_norm": 0.43890202045440674,
"learning_rate": 9.866026566398248e-06,
"loss": 0.439,
"step": 464
},
{
"epoch": 0.5008976660682226,
"grad_norm": 0.4467841684818268,
"learning_rate": 9.864580881588958e-06,
"loss": 0.4621,
"step": 465
},
{
"epoch": 0.5019748653500897,
"grad_norm": 0.5091913342475891,
"learning_rate": 9.863127545577868e-06,
"loss": 0.4427,
"step": 466
},
{
"epoch": 0.503052064631957,
"grad_norm": 0.4745034873485565,
"learning_rate": 9.86166656065084e-06,
"loss": 0.4754,
"step": 467
},
{
"epoch": 0.5041292639138241,
"grad_norm": 0.4376344680786133,
"learning_rate": 9.860197929105769e-06,
"loss": 0.4786,
"step": 468
},
{
"epoch": 0.5052064631956912,
"grad_norm": 0.4569643437862396,
"learning_rate": 9.858721653252571e-06,
"loss": 0.4312,
"step": 469
},
{
"epoch": 0.5062836624775583,
"grad_norm": 0.482858806848526,
"learning_rate": 9.857237735413194e-06,
"loss": 0.4506,
"step": 470
},
{
"epoch": 0.5073608617594255,
"grad_norm": 0.4289652705192566,
"learning_rate": 9.855746177921602e-06,
"loss": 0.4607,
"step": 471
},
{
"epoch": 0.5084380610412926,
"grad_norm": 0.4449455440044403,
"learning_rate": 9.854246983123771e-06,
"loss": 0.4436,
"step": 472
},
{
"epoch": 0.5095152603231597,
"grad_norm": 0.4095325767993927,
"learning_rate": 9.852740153377698e-06,
"loss": 0.4474,
"step": 473
},
{
"epoch": 0.510592459605027,
"grad_norm": 0.45156896114349365,
"learning_rate": 9.851225691053382e-06,
"loss": 0.4492,
"step": 474
},
{
"epoch": 0.5116696588868941,
"grad_norm": 0.37197962403297424,
"learning_rate": 9.849703598532823e-06,
"loss": 0.4586,
"step": 475
},
{
"epoch": 0.5127468581687612,
"grad_norm": 0.44566208124160767,
"learning_rate": 9.848173878210034e-06,
"loss": 0.4366,
"step": 476
},
{
"epoch": 0.5138240574506283,
"grad_norm": 0.4585574269294739,
"learning_rate": 9.846636532491015e-06,
"loss": 0.4386,
"step": 477
},
{
"epoch": 0.5149012567324955,
"grad_norm": 0.48970794677734375,
"learning_rate": 9.845091563793763e-06,
"loss": 0.4569,
"step": 478
},
{
"epoch": 0.5159784560143627,
"grad_norm": 0.3991914987564087,
"learning_rate": 9.843538974548264e-06,
"loss": 0.4386,
"step": 479
},
{
"epoch": 0.5170556552962298,
"grad_norm": 0.5011213421821594,
"learning_rate": 9.841978767196495e-06,
"loss": 0.4517,
"step": 480
},
{
"epoch": 0.518132854578097,
"grad_norm": 0.4288571774959564,
"learning_rate": 9.840410944192407e-06,
"loss": 0.4492,
"step": 481
},
{
"epoch": 0.5192100538599641,
"grad_norm": 0.5407549142837524,
"learning_rate": 9.838835508001934e-06,
"loss": 0.4804,
"step": 482
},
{
"epoch": 0.5202872531418312,
"grad_norm": 0.41034746170043945,
"learning_rate": 9.837252461102981e-06,
"loss": 0.4582,
"step": 483
},
{
"epoch": 0.5213644524236983,
"grad_norm": 0.4609270989894867,
"learning_rate": 9.835661805985432e-06,
"loss": 0.4506,
"step": 484
},
{
"epoch": 0.5224416517055656,
"grad_norm": 0.3973561227321625,
"learning_rate": 9.834063545151125e-06,
"loss": 0.4453,
"step": 485
},
{
"epoch": 0.5235188509874327,
"grad_norm": 0.3957566022872925,
"learning_rate": 9.832457681113867e-06,
"loss": 0.4625,
"step": 486
},
{
"epoch": 0.5245960502692998,
"grad_norm": 0.427836537361145,
"learning_rate": 9.830844216399426e-06,
"loss": 0.4334,
"step": 487
},
{
"epoch": 0.525673249551167,
"grad_norm": 0.39041754603385925,
"learning_rate": 9.829223153545522e-06,
"loss": 0.4597,
"step": 488
},
{
"epoch": 0.5267504488330341,
"grad_norm": 0.37624824047088623,
"learning_rate": 9.827594495101824e-06,
"loss": 0.4366,
"step": 489
},
{
"epoch": 0.5278276481149012,
"grad_norm": 0.47729989886283875,
"learning_rate": 9.825958243629951e-06,
"loss": 0.419,
"step": 490
},
{
"epoch": 0.5289048473967684,
"grad_norm": 0.37915152311325073,
"learning_rate": 9.824314401703461e-06,
"loss": 0.4453,
"step": 491
},
{
"epoch": 0.5299820466786356,
"grad_norm": 0.36883866786956787,
"learning_rate": 9.822662971907853e-06,
"loss": 0.4203,
"step": 492
},
{
"epoch": 0.5310592459605027,
"grad_norm": 0.4049866497516632,
"learning_rate": 9.82100395684056e-06,
"loss": 0.4332,
"step": 493
},
{
"epoch": 0.5321364452423698,
"grad_norm": 0.399457186460495,
"learning_rate": 9.819337359110945e-06,
"loss": 0.4825,
"step": 494
},
{
"epoch": 0.533213644524237,
"grad_norm": 0.34586960077285767,
"learning_rate": 9.8176631813403e-06,
"loss": 0.4381,
"step": 495
},
{
"epoch": 0.5342908438061041,
"grad_norm": 0.39063382148742676,
"learning_rate": 9.815981426161834e-06,
"loss": 0.4538,
"step": 496
},
{
"epoch": 0.5353680430879713,
"grad_norm": 0.3408878445625305,
"learning_rate": 9.81429209622068e-06,
"loss": 0.4722,
"step": 497
},
{
"epoch": 0.5364452423698384,
"grad_norm": 0.3885675370693207,
"learning_rate": 9.812595194173875e-06,
"loss": 0.4577,
"step": 498
},
{
"epoch": 0.5375224416517056,
"grad_norm": 0.36339253187179565,
"learning_rate": 9.81089072269038e-06,
"loss": 0.4465,
"step": 499
},
{
"epoch": 0.5385996409335727,
"grad_norm": 0.37926578521728516,
"learning_rate": 9.809178684451052e-06,
"loss": 0.4482,
"step": 500
},
{
"epoch": 0.5396768402154398,
"grad_norm": 0.370346337556839,
"learning_rate": 9.807459082148648e-06,
"loss": 0.4602,
"step": 501
},
{
"epoch": 0.540754039497307,
"grad_norm": 0.42454493045806885,
"learning_rate": 9.805731918487832e-06,
"loss": 0.4682,
"step": 502
},
{
"epoch": 0.5418312387791742,
"grad_norm": 0.3915000259876251,
"learning_rate": 9.803997196185146e-06,
"loss": 0.4494,
"step": 503
},
{
"epoch": 0.5429084380610413,
"grad_norm": 0.43712353706359863,
"learning_rate": 9.802254917969033e-06,
"loss": 0.4652,
"step": 504
},
{
"epoch": 0.5439856373429084,
"grad_norm": 0.45863014459609985,
"learning_rate": 9.800505086579816e-06,
"loss": 0.4262,
"step": 505
},
{
"epoch": 0.5450628366247756,
"grad_norm": 0.39945051074028015,
"learning_rate": 9.798747704769696e-06,
"loss": 0.4586,
"step": 506
},
{
"epoch": 0.5461400359066427,
"grad_norm": 0.4776397943496704,
"learning_rate": 9.796982775302755e-06,
"loss": 0.4656,
"step": 507
},
{
"epoch": 0.5472172351885098,
"grad_norm": 0.430626779794693,
"learning_rate": 9.795210300954938e-06,
"loss": 0.457,
"step": 508
},
{
"epoch": 0.5482944344703771,
"grad_norm": 0.4296889007091522,
"learning_rate": 9.793430284514063e-06,
"loss": 0.4268,
"step": 509
},
{
"epoch": 0.5493716337522442,
"grad_norm": 0.4044749140739441,
"learning_rate": 9.79164272877981e-06,
"loss": 0.46,
"step": 510
},
{
"epoch": 0.5504488330341113,
"grad_norm": 0.4012002646923065,
"learning_rate": 9.789847636563718e-06,
"loss": 0.439,
"step": 511
},
{
"epoch": 0.5515260323159784,
"grad_norm": 0.396487295627594,
"learning_rate": 9.788045010689173e-06,
"loss": 0.442,
"step": 512
},
{
"epoch": 0.5526032315978456,
"grad_norm": 0.3827875554561615,
"learning_rate": 9.786234853991419e-06,
"loss": 0.4548,
"step": 513
},
{
"epoch": 0.5536804308797127,
"grad_norm": 0.42587071657180786,
"learning_rate": 9.78441716931754e-06,
"loss": 0.4319,
"step": 514
},
{
"epoch": 0.5547576301615799,
"grad_norm": 0.4159460961818695,
"learning_rate": 9.782591959526457e-06,
"loss": 0.434,
"step": 515
},
{
"epoch": 0.5558348294434471,
"grad_norm": 0.4378649890422821,
"learning_rate": 9.780759227488937e-06,
"loss": 0.4422,
"step": 516
},
{
"epoch": 0.5569120287253142,
"grad_norm": 0.3905577063560486,
"learning_rate": 9.77891897608757e-06,
"loss": 0.4403,
"step": 517
},
{
"epoch": 0.5579892280071813,
"grad_norm": 0.44301673769950867,
"learning_rate": 9.777071208216772e-06,
"loss": 0.4522,
"step": 518
},
{
"epoch": 0.5590664272890484,
"grad_norm": 0.40776389837265015,
"learning_rate": 9.775215926782788e-06,
"loss": 0.4511,
"step": 519
},
{
"epoch": 0.5601436265709157,
"grad_norm": 0.44333213567733765,
"learning_rate": 9.773353134703675e-06,
"loss": 0.4624,
"step": 520
},
{
"epoch": 0.5612208258527828,
"grad_norm": 0.44751521944999695,
"learning_rate": 9.771482834909306e-06,
"loss": 0.4542,
"step": 521
},
{
"epoch": 0.5622980251346499,
"grad_norm": 0.4059796631336212,
"learning_rate": 9.769605030341356e-06,
"loss": 0.4471,
"step": 522
},
{
"epoch": 0.5633752244165171,
"grad_norm": 0.44883087277412415,
"learning_rate": 9.767719723953315e-06,
"loss": 0.4492,
"step": 523
},
{
"epoch": 0.5644524236983842,
"grad_norm": 0.40039992332458496,
"learning_rate": 9.765826918710466e-06,
"loss": 0.4455,
"step": 524
},
{
"epoch": 0.5655296229802513,
"grad_norm": 0.44426193833351135,
"learning_rate": 9.763926617589883e-06,
"loss": 0.4551,
"step": 525
},
{
"epoch": 0.5666068222621184,
"grad_norm": 0.4035399854183197,
"learning_rate": 9.762018823580436e-06,
"loss": 0.4665,
"step": 526
},
{
"epoch": 0.5676840215439857,
"grad_norm": 0.4453595280647278,
"learning_rate": 9.760103539682777e-06,
"loss": 0.4653,
"step": 527
},
{
"epoch": 0.5687612208258528,
"grad_norm": 0.37671253085136414,
"learning_rate": 9.758180768909338e-06,
"loss": 0.4205,
"step": 528
},
{
"epoch": 0.5698384201077199,
"grad_norm": 0.42255714535713196,
"learning_rate": 9.75625051428433e-06,
"loss": 0.4572,
"step": 529
},
{
"epoch": 0.5709156193895871,
"grad_norm": 0.4075011909008026,
"learning_rate": 9.754312778843727e-06,
"loss": 0.4314,
"step": 530
},
{
"epoch": 0.5719928186714542,
"grad_norm": 0.5155799984931946,
"learning_rate": 9.752367565635281e-06,
"loss": 0.4794,
"step": 531
},
{
"epoch": 0.5730700179533214,
"grad_norm": 0.41437146067619324,
"learning_rate": 9.750414877718495e-06,
"loss": 0.4477,
"step": 532
},
{
"epoch": 0.5741472172351885,
"grad_norm": 0.5376664996147156,
"learning_rate": 9.748454718164635e-06,
"loss": 0.4369,
"step": 533
},
{
"epoch": 0.5752244165170557,
"grad_norm": 0.3836204707622528,
"learning_rate": 9.746487090056712e-06,
"loss": 0.4521,
"step": 534
},
{
"epoch": 0.5763016157989228,
"grad_norm": 0.4510630667209625,
"learning_rate": 9.744511996489495e-06,
"loss": 0.446,
"step": 535
},
{
"epoch": 0.5773788150807899,
"grad_norm": 0.40221360325813293,
"learning_rate": 9.742529440569481e-06,
"loss": 0.4308,
"step": 536
},
{
"epoch": 0.5784560143626571,
"grad_norm": 0.46302559971809387,
"learning_rate": 9.740539425414913e-06,
"loss": 0.458,
"step": 537
},
{
"epoch": 0.5795332136445243,
"grad_norm": 0.405277818441391,
"learning_rate": 9.738541954155766e-06,
"loss": 0.4406,
"step": 538
},
{
"epoch": 0.5806104129263914,
"grad_norm": 0.4966040849685669,
"learning_rate": 9.736537029933738e-06,
"loss": 0.4403,
"step": 539
},
{
"epoch": 0.5816876122082585,
"grad_norm": 0.45864376425743103,
"learning_rate": 9.734524655902253e-06,
"loss": 0.4404,
"step": 540
},
{
"epoch": 0.5827648114901257,
"grad_norm": 0.4572237730026245,
"learning_rate": 9.732504835226451e-06,
"loss": 0.4396,
"step": 541
},
{
"epoch": 0.5838420107719928,
"grad_norm": 0.5045959949493408,
"learning_rate": 9.730477571083184e-06,
"loss": 0.4383,
"step": 542
},
{
"epoch": 0.5849192100538599,
"grad_norm": 0.5284016132354736,
"learning_rate": 9.728442866661013e-06,
"loss": 0.4508,
"step": 543
},
{
"epoch": 0.5859964093357272,
"grad_norm": 0.39870405197143555,
"learning_rate": 9.726400725160199e-06,
"loss": 0.45,
"step": 544
},
{
"epoch": 0.5870736086175943,
"grad_norm": 0.48833194375038147,
"learning_rate": 9.724351149792702e-06,
"loss": 0.4431,
"step": 545
},
{
"epoch": 0.5881508078994614,
"grad_norm": 0.577894926071167,
"learning_rate": 9.722294143782171e-06,
"loss": 0.4517,
"step": 546
},
{
"epoch": 0.5892280071813285,
"grad_norm": 0.3885682225227356,
"learning_rate": 9.720229710363949e-06,
"loss": 0.4537,
"step": 547
},
{
"epoch": 0.5903052064631957,
"grad_norm": 0.42628440260887146,
"learning_rate": 9.718157852785057e-06,
"loss": 0.4396,
"step": 548
},
{
"epoch": 0.5913824057450628,
"grad_norm": 0.5009000897407532,
"learning_rate": 9.71607857430419e-06,
"loss": 0.4676,
"step": 549
},
{
"epoch": 0.59245960502693,
"grad_norm": 0.39723867177963257,
"learning_rate": 9.71399187819172e-06,
"loss": 0.4522,
"step": 550
},
{
"epoch": 0.5935368043087971,
"grad_norm": 0.359291136264801,
"learning_rate": 9.711897767729683e-06,
"loss": 0.4578,
"step": 551
},
{
"epoch": 0.5946140035906643,
"grad_norm": 0.4119752049446106,
"learning_rate": 9.709796246211778e-06,
"loss": 0.4743,
"step": 552
},
{
"epoch": 0.5956912028725314,
"grad_norm": 0.4480607509613037,
"learning_rate": 9.707687316943359e-06,
"loss": 0.4669,
"step": 553
},
{
"epoch": 0.5967684021543985,
"grad_norm": 0.3882138133049011,
"learning_rate": 9.705570983241433e-06,
"loss": 0.4394,
"step": 554
},
{
"epoch": 0.5978456014362658,
"grad_norm": 0.4053027927875519,
"learning_rate": 9.70344724843465e-06,
"loss": 0.4286,
"step": 555
},
{
"epoch": 0.5989228007181329,
"grad_norm": 0.40790650248527527,
"learning_rate": 9.701316115863304e-06,
"loss": 0.416,
"step": 556
},
{
"epoch": 0.6,
"grad_norm": 0.40172263979911804,
"learning_rate": 9.699177588879323e-06,
"loss": 0.4298,
"step": 557
},
{
"epoch": 0.6010771992818671,
"grad_norm": 0.4236910343170166,
"learning_rate": 9.697031670846266e-06,
"loss": 0.4699,
"step": 558
},
{
"epoch": 0.6021543985637343,
"grad_norm": 0.48015671968460083,
"learning_rate": 9.694878365139313e-06,
"loss": 0.4487,
"step": 559
},
{
"epoch": 0.6032315978456014,
"grad_norm": 0.4922192096710205,
"learning_rate": 9.69271767514527e-06,
"loss": 0.448,
"step": 560
},
{
"epoch": 0.6043087971274685,
"grad_norm": 0.36672329902648926,
"learning_rate": 9.690549604262556e-06,
"loss": 0.4333,
"step": 561
},
{
"epoch": 0.6053859964093358,
"grad_norm": 0.5522063970565796,
"learning_rate": 9.688374155901192e-06,
"loss": 0.4629,
"step": 562
},
{
"epoch": 0.6064631956912029,
"grad_norm": 0.4292784333229065,
"learning_rate": 9.68619133348281e-06,
"loss": 0.4395,
"step": 563
},
{
"epoch": 0.60754039497307,
"grad_norm": 0.5118968486785889,
"learning_rate": 9.68400114044064e-06,
"loss": 0.4449,
"step": 564
},
{
"epoch": 0.6086175942549371,
"grad_norm": 0.47617998719215393,
"learning_rate": 9.6818035802195e-06,
"loss": 0.4261,
"step": 565
},
{
"epoch": 0.6096947935368043,
"grad_norm": 0.4254116415977478,
"learning_rate": 9.679598656275797e-06,
"loss": 0.4502,
"step": 566
},
{
"epoch": 0.6107719928186714,
"grad_norm": 0.41793403029441833,
"learning_rate": 9.677386372077525e-06,
"loss": 0.465,
"step": 567
},
{
"epoch": 0.6118491921005386,
"grad_norm": 0.4164504408836365,
"learning_rate": 9.67516673110425e-06,
"loss": 0.4436,
"step": 568
},
{
"epoch": 0.6129263913824058,
"grad_norm": 0.38877761363983154,
"learning_rate": 9.672939736847104e-06,
"loss": 0.4471,
"step": 569
},
{
"epoch": 0.6140035906642729,
"grad_norm": 0.4273861050605774,
"learning_rate": 9.670705392808796e-06,
"loss": 0.4663,
"step": 570
},
{
"epoch": 0.61508078994614,
"grad_norm": 0.5441392660140991,
"learning_rate": 9.668463702503588e-06,
"loss": 0.4682,
"step": 571
},
{
"epoch": 0.6161579892280071,
"grad_norm": 0.3736647963523865,
"learning_rate": 9.666214669457295e-06,
"loss": 0.4447,
"step": 572
},
{
"epoch": 0.6172351885098744,
"grad_norm": 0.5223665237426758,
"learning_rate": 9.663958297207286e-06,
"loss": 0.4553,
"step": 573
},
{
"epoch": 0.6183123877917415,
"grad_norm": 0.4206368327140808,
"learning_rate": 9.661694589302471e-06,
"loss": 0.4547,
"step": 574
},
{
"epoch": 0.6193895870736086,
"grad_norm": 0.4165674149990082,
"learning_rate": 9.659423549303298e-06,
"loss": 0.4339,
"step": 575
},
{
"epoch": 0.6204667863554758,
"grad_norm": 0.3881623446941376,
"learning_rate": 9.657145180781745e-06,
"loss": 0.4483,
"step": 576
},
{
"epoch": 0.6215439856373429,
"grad_norm": 0.44433334469795227,
"learning_rate": 9.654859487321318e-06,
"loss": 0.4381,
"step": 577
},
{
"epoch": 0.62262118491921,
"grad_norm": 0.38431665301322937,
"learning_rate": 9.652566472517048e-06,
"loss": 0.4436,
"step": 578
},
{
"epoch": 0.6236983842010771,
"grad_norm": 0.3904261887073517,
"learning_rate": 9.650266139975474e-06,
"loss": 0.4563,
"step": 579
},
{
"epoch": 0.6247755834829444,
"grad_norm": 0.450923353433609,
"learning_rate": 9.647958493314653e-06,
"loss": 0.4458,
"step": 580
},
{
"epoch": 0.6258527827648115,
"grad_norm": 0.4171711504459381,
"learning_rate": 9.645643536164136e-06,
"loss": 0.4604,
"step": 581
},
{
"epoch": 0.6269299820466786,
"grad_norm": 0.44739070534706116,
"learning_rate": 9.643321272164981e-06,
"loss": 0.4468,
"step": 582
},
{
"epoch": 0.6280071813285458,
"grad_norm": 0.4494309425354004,
"learning_rate": 9.640991704969735e-06,
"loss": 0.4547,
"step": 583
},
{
"epoch": 0.6290843806104129,
"grad_norm": 0.3796863853931427,
"learning_rate": 9.63865483824243e-06,
"loss": 0.4258,
"step": 584
},
{
"epoch": 0.63016157989228,
"grad_norm": 0.4536336660385132,
"learning_rate": 9.63631067565858e-06,
"loss": 0.4642,
"step": 585
},
{
"epoch": 0.6312387791741472,
"grad_norm": 0.37764403223991394,
"learning_rate": 9.633959220905179e-06,
"loss": 0.4268,
"step": 586
},
{
"epoch": 0.6323159784560144,
"grad_norm": 0.3929099142551422,
"learning_rate": 9.631600477680683e-06,
"loss": 0.4421,
"step": 587
},
{
"epoch": 0.6333931777378815,
"grad_norm": 0.4072980284690857,
"learning_rate": 9.629234449695015e-06,
"loss": 0.4495,
"step": 588
},
{
"epoch": 0.6344703770197486,
"grad_norm": 0.4604843556880951,
"learning_rate": 9.62686114066956e-06,
"loss": 0.4872,
"step": 589
},
{
"epoch": 0.6355475763016158,
"grad_norm": 0.41248437762260437,
"learning_rate": 9.624480554337144e-06,
"loss": 0.4375,
"step": 590
},
{
"epoch": 0.636624775583483,
"grad_norm": 0.3883218467235565,
"learning_rate": 9.62209269444205e-06,
"loss": 0.4502,
"step": 591
},
{
"epoch": 0.6377019748653501,
"grad_norm": 0.44741594791412354,
"learning_rate": 9.619697564739996e-06,
"loss": 0.4515,
"step": 592
},
{
"epoch": 0.6387791741472172,
"grad_norm": 0.4136967658996582,
"learning_rate": 9.617295168998135e-06,
"loss": 0.4518,
"step": 593
},
{
"epoch": 0.6398563734290844,
"grad_norm": 0.4265954792499542,
"learning_rate": 9.614885510995047e-06,
"loss": 0.4418,
"step": 594
},
{
"epoch": 0.6409335727109515,
"grad_norm": 0.4041872024536133,
"learning_rate": 9.612468594520739e-06,
"loss": 0.4533,
"step": 595
},
{
"epoch": 0.6420107719928186,
"grad_norm": 0.4049215018749237,
"learning_rate": 9.610044423376628e-06,
"loss": 0.4142,
"step": 596
},
{
"epoch": 0.6430879712746859,
"grad_norm": 0.4189813435077667,
"learning_rate": 9.607613001375546e-06,
"loss": 0.4374,
"step": 597
},
{
"epoch": 0.644165170556553,
"grad_norm": 0.4443398118019104,
"learning_rate": 9.605174332341728e-06,
"loss": 0.4228,
"step": 598
},
{
"epoch": 0.6452423698384201,
"grad_norm": 0.499967485666275,
"learning_rate": 9.602728420110807e-06,
"loss": 0.4484,
"step": 599
},
{
"epoch": 0.6463195691202872,
"grad_norm": 0.4229572117328644,
"learning_rate": 9.600275268529809e-06,
"loss": 0.4619,
"step": 600
},
{
"epoch": 0.6473967684021544,
"grad_norm": 0.4665996730327606,
"learning_rate": 9.597814881457144e-06,
"loss": 0.4589,
"step": 601
},
{
"epoch": 0.6484739676840215,
"grad_norm": 0.39410340785980225,
"learning_rate": 9.595347262762608e-06,
"loss": 0.4551,
"step": 602
},
{
"epoch": 0.6495511669658887,
"grad_norm": 0.37037867307662964,
"learning_rate": 9.592872416327366e-06,
"loss": 0.4338,
"step": 603
},
{
"epoch": 0.6506283662477559,
"grad_norm": 0.38161227107048035,
"learning_rate": 9.590390346043952e-06,
"loss": 0.4318,
"step": 604
},
{
"epoch": 0.651705565529623,
"grad_norm": 0.3696405291557312,
"learning_rate": 9.587901055816262e-06,
"loss": 0.4327,
"step": 605
},
{
"epoch": 0.6527827648114901,
"grad_norm": 0.398960143327713,
"learning_rate": 9.585404549559551e-06,
"loss": 0.4447,
"step": 606
},
{
"epoch": 0.6538599640933572,
"grad_norm": 0.3864385485649109,
"learning_rate": 9.582900831200419e-06,
"loss": 0.443,
"step": 607
},
{
"epoch": 0.6549371633752245,
"grad_norm": 0.3866807818412781,
"learning_rate": 9.580389904676813e-06,
"loss": 0.4329,
"step": 608
},
{
"epoch": 0.6560143626570916,
"grad_norm": 0.37632474303245544,
"learning_rate": 9.577871773938013e-06,
"loss": 0.4549,
"step": 609
},
{
"epoch": 0.6570915619389587,
"grad_norm": 0.3991330862045288,
"learning_rate": 9.575346442944635e-06,
"loss": 0.456,
"step": 610
},
{
"epoch": 0.6581687612208259,
"grad_norm": 0.44957804679870605,
"learning_rate": 9.572813915668618e-06,
"loss": 0.4751,
"step": 611
},
{
"epoch": 0.659245960502693,
"grad_norm": 0.40252184867858887,
"learning_rate": 9.570274196093217e-06,
"loss": 0.4439,
"step": 612
},
{
"epoch": 0.6603231597845601,
"grad_norm": 0.45728814601898193,
"learning_rate": 9.567727288213005e-06,
"loss": 0.4497,
"step": 613
},
{
"epoch": 0.6614003590664272,
"grad_norm": 0.40386101603507996,
"learning_rate": 9.565173196033855e-06,
"loss": 0.4626,
"step": 614
},
{
"epoch": 0.6624775583482945,
"grad_norm": 0.3945240378379822,
"learning_rate": 9.562611923572944e-06,
"loss": 0.4252,
"step": 615
},
{
"epoch": 0.6635547576301616,
"grad_norm": 0.38906341791152954,
"learning_rate": 9.56004347485874e-06,
"loss": 0.4333,
"step": 616
},
{
"epoch": 0.6646319569120287,
"grad_norm": 0.3659195303916931,
"learning_rate": 9.557467853931e-06,
"loss": 0.4405,
"step": 617
},
{
"epoch": 0.6657091561938959,
"grad_norm": 0.41885197162628174,
"learning_rate": 9.554885064840758e-06,
"loss": 0.4554,
"step": 618
},
{
"epoch": 0.666786355475763,
"grad_norm": 0.4376233220100403,
"learning_rate": 9.552295111650328e-06,
"loss": 0.4395,
"step": 619
},
{
"epoch": 0.6678635547576302,
"grad_norm": 0.4020839035511017,
"learning_rate": 9.549697998433286e-06,
"loss": 0.4556,
"step": 620
},
{
"epoch": 0.6689407540394973,
"grad_norm": 0.4188641309738159,
"learning_rate": 9.547093729274474e-06,
"loss": 0.4584,
"step": 621
},
{
"epoch": 0.6700179533213645,
"grad_norm": 0.4054971933364868,
"learning_rate": 9.54448230826999e-06,
"loss": 0.436,
"step": 622
},
{
"epoch": 0.6710951526032316,
"grad_norm": 0.3803367614746094,
"learning_rate": 9.541863739527176e-06,
"loss": 0.4861,
"step": 623
},
{
"epoch": 0.6721723518850987,
"grad_norm": 0.43850457668304443,
"learning_rate": 9.539238027164618e-06,
"loss": 0.4438,
"step": 624
},
{
"epoch": 0.6732495511669659,
"grad_norm": 0.38866519927978516,
"learning_rate": 9.536605175312142e-06,
"loss": 0.4366,
"step": 625
},
{
"epoch": 0.6743267504488331,
"grad_norm": 0.4269110858440399,
"learning_rate": 9.533965188110794e-06,
"loss": 0.4523,
"step": 626
},
{
"epoch": 0.6754039497307002,
"grad_norm": 0.4612042307853699,
"learning_rate": 9.531318069712854e-06,
"loss": 0.4544,
"step": 627
},
{
"epoch": 0.6764811490125673,
"grad_norm": 0.340231329202652,
"learning_rate": 9.52866382428181e-06,
"loss": 0.4434,
"step": 628
},
{
"epoch": 0.6775583482944345,
"grad_norm": 0.4424141049385071,
"learning_rate": 9.526002455992361e-06,
"loss": 0.43,
"step": 629
},
{
"epoch": 0.6786355475763016,
"grad_norm": 0.4322042167186737,
"learning_rate": 9.523333969030413e-06,
"loss": 0.4661,
"step": 630
},
{
"epoch": 0.6797127468581687,
"grad_norm": 0.4067363142967224,
"learning_rate": 9.520658367593065e-06,
"loss": 0.4607,
"step": 631
},
{
"epoch": 0.680789946140036,
"grad_norm": 0.5061907768249512,
"learning_rate": 9.517975655888607e-06,
"loss": 0.4365,
"step": 632
},
{
"epoch": 0.6818671454219031,
"grad_norm": 0.3870816230773926,
"learning_rate": 9.51528583813651e-06,
"loss": 0.4456,
"step": 633
},
{
"epoch": 0.6829443447037702,
"grad_norm": 0.4361846446990967,
"learning_rate": 9.512588918567429e-06,
"loss": 0.435,
"step": 634
},
{
"epoch": 0.6840215439856373,
"grad_norm": 0.4258888065814972,
"learning_rate": 9.509884901423179e-06,
"loss": 0.4534,
"step": 635
},
{
"epoch": 0.6850987432675045,
"grad_norm": 0.42448312044143677,
"learning_rate": 9.507173790956746e-06,
"loss": 0.4447,
"step": 636
},
{
"epoch": 0.6861759425493716,
"grad_norm": 0.40607163310050964,
"learning_rate": 9.504455591432269e-06,
"loss": 0.4459,
"step": 637
},
{
"epoch": 0.6872531418312388,
"grad_norm": 0.4292491376399994,
"learning_rate": 9.501730307125037e-06,
"loss": 0.4508,
"step": 638
},
{
"epoch": 0.6883303411131059,
"grad_norm": 0.417349636554718,
"learning_rate": 9.498997942321484e-06,
"loss": 0.4363,
"step": 639
},
{
"epoch": 0.6894075403949731,
"grad_norm": 0.4843739867210388,
"learning_rate": 9.496258501319178e-06,
"loss": 0.4408,
"step": 640
},
{
"epoch": 0.6904847396768402,
"grad_norm": 0.49764111638069153,
"learning_rate": 9.493511988426822e-06,
"loss": 0.4279,
"step": 641
},
{
"epoch": 0.6915619389587073,
"grad_norm": 0.4440729022026062,
"learning_rate": 9.490758407964235e-06,
"loss": 0.4249,
"step": 642
},
{
"epoch": 0.6926391382405745,
"grad_norm": 0.48478931188583374,
"learning_rate": 9.487997764262356e-06,
"loss": 0.4678,
"step": 643
},
{
"epoch": 0.6937163375224417,
"grad_norm": 0.44857296347618103,
"learning_rate": 9.48523006166323e-06,
"loss": 0.4336,
"step": 644
},
{
"epoch": 0.6947935368043088,
"grad_norm": 0.4317927956581116,
"learning_rate": 9.482455304520013e-06,
"loss": 0.4601,
"step": 645
},
{
"epoch": 0.6958707360861759,
"grad_norm": 0.46467840671539307,
"learning_rate": 9.479673497196947e-06,
"loss": 0.452,
"step": 646
},
{
"epoch": 0.6969479353680431,
"grad_norm": 0.4180663526058197,
"learning_rate": 9.476884644069366e-06,
"loss": 0.4407,
"step": 647
},
{
"epoch": 0.6980251346499102,
"grad_norm": 0.44766995310783386,
"learning_rate": 9.474088749523689e-06,
"loss": 0.4613,
"step": 648
},
{
"epoch": 0.6991023339317773,
"grad_norm": 0.4251432418823242,
"learning_rate": 9.471285817957407e-06,
"loss": 0.4254,
"step": 649
},
{
"epoch": 0.7001795332136446,
"grad_norm": 0.4335974156856537,
"learning_rate": 9.468475853779078e-06,
"loss": 0.4205,
"step": 650
},
{
"epoch": 0.7012567324955117,
"grad_norm": 0.4041954576969147,
"learning_rate": 9.465658861408325e-06,
"loss": 0.4483,
"step": 651
},
{
"epoch": 0.7023339317773788,
"grad_norm": 0.40664151310920715,
"learning_rate": 9.462834845275821e-06,
"loss": 0.4296,
"step": 652
},
{
"epoch": 0.7034111310592459,
"grad_norm": 0.4748784005641937,
"learning_rate": 9.460003809823289e-06,
"loss": 0.4431,
"step": 653
},
{
"epoch": 0.7044883303411131,
"grad_norm": 0.4240473806858063,
"learning_rate": 9.457165759503492e-06,
"loss": 0.4133,
"step": 654
},
{
"epoch": 0.7055655296229802,
"grad_norm": 0.4704611003398895,
"learning_rate": 9.454320698780226e-06,
"loss": 0.4311,
"step": 655
},
{
"epoch": 0.7066427289048474,
"grad_norm": 0.4679224193096161,
"learning_rate": 9.451468632128313e-06,
"loss": 0.4565,
"step": 656
},
{
"epoch": 0.7077199281867146,
"grad_norm": 0.3749702274799347,
"learning_rate": 9.448609564033594e-06,
"loss": 0.4539,
"step": 657
},
{
"epoch": 0.7087971274685817,
"grad_norm": 0.4322480261325836,
"learning_rate": 9.445743498992921e-06,
"loss": 0.4357,
"step": 658
},
{
"epoch": 0.7098743267504488,
"grad_norm": 0.502750039100647,
"learning_rate": 9.442870441514155e-06,
"loss": 0.4527,
"step": 659
},
{
"epoch": 0.7109515260323159,
"grad_norm": 0.39252883195877075,
"learning_rate": 9.439990396116149e-06,
"loss": 0.4479,
"step": 660
},
{
"epoch": 0.7120287253141832,
"grad_norm": 0.505723774433136,
"learning_rate": 9.437103367328755e-06,
"loss": 0.4363,
"step": 661
},
{
"epoch": 0.7131059245960503,
"grad_norm": 0.44664621353149414,
"learning_rate": 9.4342093596928e-06,
"loss": 0.4306,
"step": 662
},
{
"epoch": 0.7141831238779174,
"grad_norm": 0.450601190328598,
"learning_rate": 9.431308377760095e-06,
"loss": 0.4842,
"step": 663
},
{
"epoch": 0.7152603231597846,
"grad_norm": 0.4185259938240051,
"learning_rate": 9.428400426093413e-06,
"loss": 0.4622,
"step": 664
},
{
"epoch": 0.7163375224416517,
"grad_norm": 0.49219274520874023,
"learning_rate": 9.425485509266497e-06,
"loss": 0.4521,
"step": 665
},
{
"epoch": 0.7174147217235188,
"grad_norm": 0.45771172642707825,
"learning_rate": 9.42256363186404e-06,
"loss": 0.4623,
"step": 666
},
{
"epoch": 0.718491921005386,
"grad_norm": 0.4976024329662323,
"learning_rate": 9.419634798481681e-06,
"loss": 0.47,
"step": 667
},
{
"epoch": 0.7195691202872532,
"grad_norm": 0.43556949496269226,
"learning_rate": 9.41669901372601e-06,
"loss": 0.4329,
"step": 668
},
{
"epoch": 0.7206463195691203,
"grad_norm": 0.4830540120601654,
"learning_rate": 9.413756282214538e-06,
"loss": 0.4859,
"step": 669
},
{
"epoch": 0.7217235188509874,
"grad_norm": 0.3960484266281128,
"learning_rate": 9.41080660857571e-06,
"loss": 0.4528,
"step": 670
},
{
"epoch": 0.7228007181328546,
"grad_norm": 0.4806577265262604,
"learning_rate": 9.407849997448884e-06,
"loss": 0.445,
"step": 671
},
{
"epoch": 0.7238779174147217,
"grad_norm": 0.40394070744514465,
"learning_rate": 9.404886453484337e-06,
"loss": 0.4552,
"step": 672
},
{
"epoch": 0.7249551166965889,
"grad_norm": 0.3905143141746521,
"learning_rate": 9.401915981343243e-06,
"loss": 0.4343,
"step": 673
},
{
"epoch": 0.726032315978456,
"grad_norm": 0.37571895122528076,
"learning_rate": 9.398938585697679e-06,
"loss": 0.4168,
"step": 674
},
{
"epoch": 0.7271095152603232,
"grad_norm": 0.40112486481666565,
"learning_rate": 9.395954271230606e-06,
"loss": 0.453,
"step": 675
},
{
"epoch": 0.7281867145421903,
"grad_norm": 0.4205509126186371,
"learning_rate": 9.39296304263587e-06,
"loss": 0.4406,
"step": 676
},
{
"epoch": 0.7292639138240574,
"grad_norm": 0.3572981059551239,
"learning_rate": 9.38996490461819e-06,
"loss": 0.4394,
"step": 677
},
{
"epoch": 0.7303411131059246,
"grad_norm": 0.39409223198890686,
"learning_rate": 9.386959861893159e-06,
"loss": 0.4339,
"step": 678
},
{
"epoch": 0.7314183123877918,
"grad_norm": 0.4158811867237091,
"learning_rate": 9.383947919187219e-06,
"loss": 0.3963,
"step": 679
},
{
"epoch": 0.7324955116696589,
"grad_norm": 0.40662702918052673,
"learning_rate": 9.380929081237676e-06,
"loss": 0.4725,
"step": 680
},
{
"epoch": 0.733572710951526,
"grad_norm": 0.40022268891334534,
"learning_rate": 9.377903352792672e-06,
"loss": 0.4404,
"step": 681
},
{
"epoch": 0.7346499102333932,
"grad_norm": 0.4311719536781311,
"learning_rate": 9.374870738611192e-06,
"loss": 0.4243,
"step": 682
},
{
"epoch": 0.7357271095152603,
"grad_norm": 0.3858395516872406,
"learning_rate": 9.371831243463048e-06,
"loss": 0.4383,
"step": 683
},
{
"epoch": 0.7368043087971274,
"grad_norm": 0.4157547950744629,
"learning_rate": 9.368784872128877e-06,
"loss": 0.4385,
"step": 684
},
{
"epoch": 0.7378815080789947,
"grad_norm": 0.4222763180732727,
"learning_rate": 9.36573162940013e-06,
"loss": 0.4316,
"step": 685
},
{
"epoch": 0.7389587073608618,
"grad_norm": 0.43726587295532227,
"learning_rate": 9.362671520079065e-06,
"loss": 0.4432,
"step": 686
},
{
"epoch": 0.7400359066427289,
"grad_norm": 0.47637829184532166,
"learning_rate": 9.359604548978742e-06,
"loss": 0.4476,
"step": 687
},
{
"epoch": 0.741113105924596,
"grad_norm": 0.37087079882621765,
"learning_rate": 9.356530720923012e-06,
"loss": 0.4217,
"step": 688
},
{
"epoch": 0.7421903052064632,
"grad_norm": 0.3849940598011017,
"learning_rate": 9.35345004074651e-06,
"loss": 0.4158,
"step": 689
},
{
"epoch": 0.7432675044883303,
"grad_norm": 0.4626414477825165,
"learning_rate": 9.350362513294652e-06,
"loss": 0.4254,
"step": 690
},
{
"epoch": 0.7443447037701975,
"grad_norm": 0.5151078701019287,
"learning_rate": 9.347268143423619e-06,
"loss": 0.4622,
"step": 691
},
{
"epoch": 0.7454219030520647,
"grad_norm": 0.4353354871273041,
"learning_rate": 9.344166936000356e-06,
"loss": 0.4498,
"step": 692
},
{
"epoch": 0.7464991023339318,
"grad_norm": 0.4860036075115204,
"learning_rate": 9.341058895902563e-06,
"loss": 0.4427,
"step": 693
},
{
"epoch": 0.7475763016157989,
"grad_norm": 0.38492318987846375,
"learning_rate": 9.337944028018689e-06,
"loss": 0.4217,
"step": 694
},
{
"epoch": 0.748653500897666,
"grad_norm": 0.42594560980796814,
"learning_rate": 9.334822337247916e-06,
"loss": 0.4501,
"step": 695
},
{
"epoch": 0.7497307001795332,
"grad_norm": 0.44222599267959595,
"learning_rate": 9.33169382850016e-06,
"loss": 0.433,
"step": 696
},
{
"epoch": 0.7508078994614004,
"grad_norm": 0.38337865471839905,
"learning_rate": 9.328558506696062e-06,
"loss": 0.4664,
"step": 697
},
{
"epoch": 0.7518850987432675,
"grad_norm": 0.3998524844646454,
"learning_rate": 9.325416376766978e-06,
"loss": 0.4384,
"step": 698
},
{
"epoch": 0.7529622980251347,
"grad_norm": 0.45198628306388855,
"learning_rate": 9.322267443654974e-06,
"loss": 0.4759,
"step": 699
},
{
"epoch": 0.7540394973070018,
"grad_norm": 0.3957984745502472,
"learning_rate": 9.319111712312811e-06,
"loss": 0.4623,
"step": 700
},
{
"epoch": 0.7551166965888689,
"grad_norm": 0.4148131012916565,
"learning_rate": 9.315949187703947e-06,
"loss": 0.4394,
"step": 701
},
{
"epoch": 0.756193895870736,
"grad_norm": 0.4065891206264496,
"learning_rate": 9.312779874802527e-06,
"loss": 0.4458,
"step": 702
},
{
"epoch": 0.7572710951526033,
"grad_norm": 0.4025750756263733,
"learning_rate": 9.309603778593364e-06,
"loss": 0.4411,
"step": 703
},
{
"epoch": 0.7583482944344704,
"grad_norm": 0.3583240211009979,
"learning_rate": 9.306420904071949e-06,
"loss": 0.4351,
"step": 704
},
{
"epoch": 0.7594254937163375,
"grad_norm": 0.38981419801712036,
"learning_rate": 9.30323125624443e-06,
"loss": 0.4305,
"step": 705
},
{
"epoch": 0.7605026929982047,
"grad_norm": 0.3930661976337433,
"learning_rate": 9.300034840127608e-06,
"loss": 0.4424,
"step": 706
},
{
"epoch": 0.7615798922800718,
"grad_norm": 0.3722686767578125,
"learning_rate": 9.29683166074893e-06,
"loss": 0.4333,
"step": 707
},
{
"epoch": 0.762657091561939,
"grad_norm": 0.383351594209671,
"learning_rate": 9.293621723146485e-06,
"loss": 0.4382,
"step": 708
},
{
"epoch": 0.7637342908438061,
"grad_norm": 0.3991412818431854,
"learning_rate": 9.290405032368983e-06,
"loss": 0.4391,
"step": 709
},
{
"epoch": 0.7648114901256733,
"grad_norm": 0.3822011947631836,
"learning_rate": 9.287181593475762e-06,
"loss": 0.4373,
"step": 710
},
{
"epoch": 0.7658886894075404,
"grad_norm": 0.3904517889022827,
"learning_rate": 9.283951411536774e-06,
"loss": 0.4476,
"step": 711
},
{
"epoch": 0.7669658886894075,
"grad_norm": 0.4416585862636566,
"learning_rate": 9.28071449163257e-06,
"loss": 0.4357,
"step": 712
},
{
"epoch": 0.7680430879712747,
"grad_norm": 0.376960426568985,
"learning_rate": 9.277470838854307e-06,
"loss": 0.4535,
"step": 713
},
{
"epoch": 0.7691202872531419,
"grad_norm": 0.4053441882133484,
"learning_rate": 9.274220458303727e-06,
"loss": 0.4332,
"step": 714
},
{
"epoch": 0.770197486535009,
"grad_norm": 0.37079691886901855,
"learning_rate": 9.270963355093154e-06,
"loss": 0.4827,
"step": 715
},
{
"epoch": 0.7712746858168761,
"grad_norm": 0.40362897515296936,
"learning_rate": 9.267699534345488e-06,
"loss": 0.457,
"step": 716
},
{
"epoch": 0.7723518850987433,
"grad_norm": 0.3751955032348633,
"learning_rate": 9.264429001194193e-06,
"loss": 0.4186,
"step": 717
},
{
"epoch": 0.7734290843806104,
"grad_norm": 0.353248655796051,
"learning_rate": 9.261151760783289e-06,
"loss": 0.4459,
"step": 718
},
{
"epoch": 0.7745062836624775,
"grad_norm": 0.41425472497940063,
"learning_rate": 9.257867818267347e-06,
"loss": 0.4372,
"step": 719
},
{
"epoch": 0.7755834829443446,
"grad_norm": 0.43795788288116455,
"learning_rate": 9.254577178811482e-06,
"loss": 0.4458,
"step": 720
},
{
"epoch": 0.7766606822262119,
"grad_norm": 0.3875672519207001,
"learning_rate": 9.251279847591338e-06,
"loss": 0.4455,
"step": 721
},
{
"epoch": 0.777737881508079,
"grad_norm": 0.36370179057121277,
"learning_rate": 9.247975829793086e-06,
"loss": 0.4501,
"step": 722
},
{
"epoch": 0.7788150807899461,
"grad_norm": 0.35113972425460815,
"learning_rate": 9.244665130613411e-06,
"loss": 0.428,
"step": 723
},
{
"epoch": 0.7798922800718133,
"grad_norm": 0.4404146075248718,
"learning_rate": 9.241347755259514e-06,
"loss": 0.458,
"step": 724
},
{
"epoch": 0.7809694793536804,
"grad_norm": 0.34337273240089417,
"learning_rate": 9.238023708949087e-06,
"loss": 0.4295,
"step": 725
},
{
"epoch": 0.7820466786355476,
"grad_norm": 0.34650593996047974,
"learning_rate": 9.234692996910324e-06,
"loss": 0.4494,
"step": 726
},
{
"epoch": 0.7831238779174147,
"grad_norm": 0.3992871940135956,
"learning_rate": 9.231355624381893e-06,
"loss": 0.4188,
"step": 727
},
{
"epoch": 0.7842010771992819,
"grad_norm": 0.39534062147140503,
"learning_rate": 9.22801159661295e-06,
"loss": 0.4548,
"step": 728
},
{
"epoch": 0.785278276481149,
"grad_norm": 0.37745481729507446,
"learning_rate": 9.224660918863104e-06,
"loss": 0.4464,
"step": 729
},
{
"epoch": 0.7863554757630161,
"grad_norm": 0.40657269954681396,
"learning_rate": 9.221303596402435e-06,
"loss": 0.4624,
"step": 730
},
{
"epoch": 0.7874326750448833,
"grad_norm": 0.4322795569896698,
"learning_rate": 9.217939634511473e-06,
"loss": 0.4469,
"step": 731
},
{
"epoch": 0.7885098743267505,
"grad_norm": 0.3632979094982147,
"learning_rate": 9.214569038481183e-06,
"loss": 0.4548,
"step": 732
},
{
"epoch": 0.7895870736086176,
"grad_norm": 0.3777786195278168,
"learning_rate": 9.21119181361297e-06,
"loss": 0.4183,
"step": 733
},
{
"epoch": 0.7906642728904847,
"grad_norm": 0.39092251658439636,
"learning_rate": 9.207807965218668e-06,
"loss": 0.4216,
"step": 734
},
{
"epoch": 0.7917414721723519,
"grad_norm": 0.3456818163394928,
"learning_rate": 9.204417498620521e-06,
"loss": 0.4511,
"step": 735
},
{
"epoch": 0.792818671454219,
"grad_norm": 0.39940401911735535,
"learning_rate": 9.201020419151191e-06,
"loss": 0.4586,
"step": 736
},
{
"epoch": 0.7938958707360861,
"grad_norm": 0.37644898891448975,
"learning_rate": 9.197616732153733e-06,
"loss": 0.4608,
"step": 737
},
{
"epoch": 0.7949730700179534,
"grad_norm": 0.36318060755729675,
"learning_rate": 9.194206442981601e-06,
"loss": 0.4473,
"step": 738
},
{
"epoch": 0.7960502692998205,
"grad_norm": 0.3769637644290924,
"learning_rate": 9.190789556998627e-06,
"loss": 0.4338,
"step": 739
},
{
"epoch": 0.7971274685816876,
"grad_norm": 0.37595972418785095,
"learning_rate": 9.187366079579025e-06,
"loss": 0.4374,
"step": 740
},
{
"epoch": 0.7982046678635547,
"grad_norm": 0.35937148332595825,
"learning_rate": 9.18393601610737e-06,
"loss": 0.4159,
"step": 741
},
{
"epoch": 0.7992818671454219,
"grad_norm": 0.3800438642501831,
"learning_rate": 9.180499371978603e-06,
"loss": 0.4326,
"step": 742
},
{
"epoch": 0.800359066427289,
"grad_norm": 0.36832118034362793,
"learning_rate": 9.17705615259801e-06,
"loss": 0.4255,
"step": 743
},
{
"epoch": 0.8014362657091562,
"grad_norm": 0.33298200368881226,
"learning_rate": 9.173606363381218e-06,
"loss": 0.4216,
"step": 744
},
{
"epoch": 0.8025134649910234,
"grad_norm": 0.39970219135284424,
"learning_rate": 9.170150009754193e-06,
"loss": 0.4473,
"step": 745
},
{
"epoch": 0.8035906642728905,
"grad_norm": 0.38992324471473694,
"learning_rate": 9.16668709715322e-06,
"loss": 0.4444,
"step": 746
},
{
"epoch": 0.8046678635547576,
"grad_norm": 0.5129222273826599,
"learning_rate": 9.163217631024901e-06,
"loss": 0.4381,
"step": 747
},
{
"epoch": 0.8057450628366247,
"grad_norm": 0.4020202159881592,
"learning_rate": 9.159741616826152e-06,
"loss": 0.4901,
"step": 748
},
{
"epoch": 0.806822262118492,
"grad_norm": 0.3771527111530304,
"learning_rate": 9.156259060024177e-06,
"loss": 0.452,
"step": 749
},
{
"epoch": 0.8078994614003591,
"grad_norm": 0.397592157125473,
"learning_rate": 9.152769966096483e-06,
"loss": 0.4195,
"step": 750
},
{
"epoch": 0.8089766606822262,
"grad_norm": 0.3930964171886444,
"learning_rate": 9.149274340530848e-06,
"loss": 0.4313,
"step": 751
},
{
"epoch": 0.8100538599640934,
"grad_norm": 0.3677625358104706,
"learning_rate": 9.145772188825328e-06,
"loss": 0.4126,
"step": 752
},
{
"epoch": 0.8111310592459605,
"grad_norm": 0.38228854537010193,
"learning_rate": 9.142263516488246e-06,
"loss": 0.4368,
"step": 753
},
{
"epoch": 0.8122082585278276,
"grad_norm": 0.37611261010169983,
"learning_rate": 9.138748329038175e-06,
"loss": 0.4218,
"step": 754
},
{
"epoch": 0.8132854578096947,
"grad_norm": 0.3851577341556549,
"learning_rate": 9.135226632003942e-06,
"loss": 0.4612,
"step": 755
},
{
"epoch": 0.814362657091562,
"grad_norm": 0.3648301362991333,
"learning_rate": 9.131698430924606e-06,
"loss": 0.405,
"step": 756
},
{
"epoch": 0.8154398563734291,
"grad_norm": 0.34730029106140137,
"learning_rate": 9.12816373134946e-06,
"loss": 0.4286,
"step": 757
},
{
"epoch": 0.8165170556552962,
"grad_norm": 0.42742881178855896,
"learning_rate": 9.124622538838015e-06,
"loss": 0.4343,
"step": 758
},
{
"epoch": 0.8175942549371634,
"grad_norm": 0.36673569679260254,
"learning_rate": 9.121074858959997e-06,
"loss": 0.4206,
"step": 759
},
{
"epoch": 0.8186714542190305,
"grad_norm": 0.4314405620098114,
"learning_rate": 9.117520697295337e-06,
"loss": 0.4568,
"step": 760
},
{
"epoch": 0.8197486535008977,
"grad_norm": 0.4013466238975525,
"learning_rate": 9.113960059434157e-06,
"loss": 0.4349,
"step": 761
},
{
"epoch": 0.8208258527827648,
"grad_norm": 0.4372273087501526,
"learning_rate": 9.110392950976765e-06,
"loss": 0.4273,
"step": 762
},
{
"epoch": 0.821903052064632,
"grad_norm": 0.4042438566684723,
"learning_rate": 9.10681937753365e-06,
"loss": 0.4513,
"step": 763
},
{
"epoch": 0.8229802513464991,
"grad_norm": 0.45694154500961304,
"learning_rate": 9.103239344725465e-06,
"loss": 0.4583,
"step": 764
},
{
"epoch": 0.8240574506283662,
"grad_norm": 0.4015597403049469,
"learning_rate": 9.099652858183027e-06,
"loss": 0.413,
"step": 765
},
{
"epoch": 0.8251346499102334,
"grad_norm": 0.37840738892555237,
"learning_rate": 9.0960599235473e-06,
"loss": 0.4404,
"step": 766
},
{
"epoch": 0.8262118491921006,
"grad_norm": 0.4393068552017212,
"learning_rate": 9.092460546469393e-06,
"loss": 0.4271,
"step": 767
},
{
"epoch": 0.8272890484739677,
"grad_norm": 0.3934628963470459,
"learning_rate": 9.088854732610544e-06,
"loss": 0.4474,
"step": 768
},
{
"epoch": 0.8283662477558348,
"grad_norm": 0.3966236412525177,
"learning_rate": 9.085242487642117e-06,
"loss": 0.4413,
"step": 769
},
{
"epoch": 0.829443447037702,
"grad_norm": 0.37130579352378845,
"learning_rate": 9.081623817245591e-06,
"loss": 0.4308,
"step": 770
},
{
"epoch": 0.8305206463195691,
"grad_norm": 0.45101043581962585,
"learning_rate": 9.077998727112553e-06,
"loss": 0.4425,
"step": 771
},
{
"epoch": 0.8315978456014362,
"grad_norm": 0.3699485659599304,
"learning_rate": 9.074367222944686e-06,
"loss": 0.4219,
"step": 772
},
{
"epoch": 0.8326750448833035,
"grad_norm": 0.43512436747550964,
"learning_rate": 9.070729310453759e-06,
"loss": 0.4318,
"step": 773
},
{
"epoch": 0.8337522441651706,
"grad_norm": 0.34934020042419434,
"learning_rate": 9.067084995361623e-06,
"loss": 0.4371,
"step": 774
},
{
"epoch": 0.8348294434470377,
"grad_norm": 0.3924720585346222,
"learning_rate": 9.063434283400199e-06,
"loss": 0.4445,
"step": 775
},
{
"epoch": 0.8359066427289048,
"grad_norm": 0.37331345677375793,
"learning_rate": 9.059777180311466e-06,
"loss": 0.4383,
"step": 776
},
{
"epoch": 0.836983842010772,
"grad_norm": 0.42386674880981445,
"learning_rate": 9.056113691847462e-06,
"loss": 0.423,
"step": 777
},
{
"epoch": 0.8380610412926391,
"grad_norm": 0.3515479564666748,
"learning_rate": 9.05244382377026e-06,
"loss": 0.4491,
"step": 778
},
{
"epoch": 0.8391382405745063,
"grad_norm": 0.3614935874938965,
"learning_rate": 9.048767581851973e-06,
"loss": 0.463,
"step": 779
},
{
"epoch": 0.8402154398563735,
"grad_norm": 0.4254121780395508,
"learning_rate": 9.045084971874738e-06,
"loss": 0.4083,
"step": 780
},
{
"epoch": 0.8412926391382406,
"grad_norm": 0.3752104938030243,
"learning_rate": 9.041395999630704e-06,
"loss": 0.4449,
"step": 781
},
{
"epoch": 0.8423698384201077,
"grad_norm": 0.3893478810787201,
"learning_rate": 9.037700670922034e-06,
"loss": 0.4313,
"step": 782
},
{
"epoch": 0.8434470377019748,
"grad_norm": 0.45131829380989075,
"learning_rate": 9.033998991560881e-06,
"loss": 0.4506,
"step": 783
},
{
"epoch": 0.844524236983842,
"grad_norm": 0.36699992418289185,
"learning_rate": 9.030290967369392e-06,
"loss": 0.4368,
"step": 784
},
{
"epoch": 0.8456014362657092,
"grad_norm": 0.4408794939517975,
"learning_rate": 9.026576604179689e-06,
"loss": 0.4366,
"step": 785
},
{
"epoch": 0.8466786355475763,
"grad_norm": 0.36175811290740967,
"learning_rate": 9.022855907833872e-06,
"loss": 0.44,
"step": 786
},
{
"epoch": 0.8477558348294435,
"grad_norm": 0.38207605481147766,
"learning_rate": 9.019128884183992e-06,
"loss": 0.4197,
"step": 787
},
{
"epoch": 0.8488330341113106,
"grad_norm": 0.3802523612976074,
"learning_rate": 9.015395539092057e-06,
"loss": 0.4259,
"step": 788
},
{
"epoch": 0.8499102333931777,
"grad_norm": 0.38638371229171753,
"learning_rate": 9.011655878430018e-06,
"loss": 0.4078,
"step": 789
},
{
"epoch": 0.8509874326750448,
"grad_norm": 0.3900395333766937,
"learning_rate": 9.00790990807976e-06,
"loss": 0.4568,
"step": 790
},
{
"epoch": 0.8520646319569121,
"grad_norm": 0.42712318897247314,
"learning_rate": 9.00415763393309e-06,
"loss": 0.4361,
"step": 791
},
{
"epoch": 0.8531418312387792,
"grad_norm": 0.4065674841403961,
"learning_rate": 9.000399061891728e-06,
"loss": 0.4504,
"step": 792
},
{
"epoch": 0.8542190305206463,
"grad_norm": 0.4172511696815491,
"learning_rate": 8.996634197867307e-06,
"loss": 0.4325,
"step": 793
},
{
"epoch": 0.8552962298025135,
"grad_norm": 0.40288570523262024,
"learning_rate": 8.992863047781346e-06,
"loss": 0.4309,
"step": 794
},
{
"epoch": 0.8563734290843806,
"grad_norm": 0.39536306262016296,
"learning_rate": 8.989085617565261e-06,
"loss": 0.4063,
"step": 795
},
{
"epoch": 0.8574506283662477,
"grad_norm": 0.3941093385219574,
"learning_rate": 8.985301913160338e-06,
"loss": 0.4214,
"step": 796
},
{
"epoch": 0.8585278276481149,
"grad_norm": 0.381067156791687,
"learning_rate": 8.981511940517734e-06,
"loss": 0.4384,
"step": 797
},
{
"epoch": 0.8596050269299821,
"grad_norm": 0.3987753689289093,
"learning_rate": 8.977715705598469e-06,
"loss": 0.4316,
"step": 798
},
{
"epoch": 0.8606822262118492,
"grad_norm": 0.4092214107513428,
"learning_rate": 8.973913214373405e-06,
"loss": 0.4381,
"step": 799
},
{
"epoch": 0.8617594254937163,
"grad_norm": 0.423684298992157,
"learning_rate": 8.970104472823249e-06,
"loss": 0.4426,
"step": 800
},
{
"epoch": 0.8628366247755835,
"grad_norm": 0.3508121073246002,
"learning_rate": 8.966289486938537e-06,
"loss": 0.4275,
"step": 801
},
{
"epoch": 0.8639138240574507,
"grad_norm": 0.3975047767162323,
"learning_rate": 8.96246826271963e-06,
"loss": 0.4399,
"step": 802
},
{
"epoch": 0.8649910233393178,
"grad_norm": 0.5154191255569458,
"learning_rate": 8.958640806176695e-06,
"loss": 0.4609,
"step": 803
},
{
"epoch": 0.8660682226211849,
"grad_norm": 0.33385977149009705,
"learning_rate": 8.954807123329703e-06,
"loss": 0.4431,
"step": 804
},
{
"epoch": 0.8671454219030521,
"grad_norm": 0.460953027009964,
"learning_rate": 8.950967220208425e-06,
"loss": 0.4309,
"step": 805
},
{
"epoch": 0.8682226211849192,
"grad_norm": 0.3735499680042267,
"learning_rate": 8.947121102852402e-06,
"loss": 0.445,
"step": 806
},
{
"epoch": 0.8692998204667863,
"grad_norm": 0.37893158197402954,
"learning_rate": 8.943268777310965e-06,
"loss": 0.4419,
"step": 807
},
{
"epoch": 0.8703770197486534,
"grad_norm": 0.39892786741256714,
"learning_rate": 8.939410249643195e-06,
"loss": 0.4393,
"step": 808
},
{
"epoch": 0.8714542190305207,
"grad_norm": 0.38968032598495483,
"learning_rate": 8.935545525917936e-06,
"loss": 0.4303,
"step": 809
},
{
"epoch": 0.8725314183123878,
"grad_norm": 0.35072895884513855,
"learning_rate": 8.93167461221378e-06,
"loss": 0.4472,
"step": 810
},
{
"epoch": 0.8736086175942549,
"grad_norm": 0.3633287847042084,
"learning_rate": 8.927797514619043e-06,
"loss": 0.4239,
"step": 811
},
{
"epoch": 0.8746858168761221,
"grad_norm": 0.3359943926334381,
"learning_rate": 8.923914239231779e-06,
"loss": 0.4284,
"step": 812
},
{
"epoch": 0.8757630161579892,
"grad_norm": 0.3926851749420166,
"learning_rate": 8.920024792159754e-06,
"loss": 0.4167,
"step": 813
},
{
"epoch": 0.8768402154398564,
"grad_norm": 0.37534475326538086,
"learning_rate": 8.916129179520443e-06,
"loss": 0.4273,
"step": 814
},
{
"epoch": 0.8779174147217235,
"grad_norm": 0.35639283061027527,
"learning_rate": 8.912227407441013e-06,
"loss": 0.4063,
"step": 815
},
{
"epoch": 0.8789946140035907,
"grad_norm": 0.4475279152393341,
"learning_rate": 8.908319482058325e-06,
"loss": 0.4196,
"step": 816
},
{
"epoch": 0.8800718132854578,
"grad_norm": 0.392677903175354,
"learning_rate": 8.904405409518916e-06,
"loss": 0.4389,
"step": 817
},
{
"epoch": 0.8811490125673249,
"grad_norm": 0.3904305100440979,
"learning_rate": 8.90048519597899e-06,
"loss": 0.4591,
"step": 818
},
{
"epoch": 0.8822262118491921,
"grad_norm": 0.35532963275909424,
"learning_rate": 8.896558847604414e-06,
"loss": 0.4203,
"step": 819
},
{
"epoch": 0.8833034111310593,
"grad_norm": 0.3612072467803955,
"learning_rate": 8.892626370570699e-06,
"loss": 0.4271,
"step": 820
},
{
"epoch": 0.8843806104129264,
"grad_norm": 0.3656427562236786,
"learning_rate": 8.888687771062999e-06,
"loss": 0.4275,
"step": 821
},
{
"epoch": 0.8854578096947935,
"grad_norm": 0.3995424211025238,
"learning_rate": 8.884743055276092e-06,
"loss": 0.4418,
"step": 822
},
{
"epoch": 0.8865350089766607,
"grad_norm": 0.37365880608558655,
"learning_rate": 8.880792229414387e-06,
"loss": 0.4201,
"step": 823
},
{
"epoch": 0.8876122082585278,
"grad_norm": 0.4127524793148041,
"learning_rate": 8.876835299691892e-06,
"loss": 0.4103,
"step": 824
},
{
"epoch": 0.8886894075403949,
"grad_norm": 0.3745189309120178,
"learning_rate": 8.87287227233222e-06,
"loss": 0.4315,
"step": 825
},
{
"epoch": 0.8897666068222622,
"grad_norm": 0.3703850507736206,
"learning_rate": 8.868903153568577e-06,
"loss": 0.4111,
"step": 826
},
{
"epoch": 0.8908438061041293,
"grad_norm": 0.366468608379364,
"learning_rate": 8.864927949643744e-06,
"loss": 0.4236,
"step": 827
},
{
"epoch": 0.8919210053859964,
"grad_norm": 0.4204203486442566,
"learning_rate": 8.860946666810078e-06,
"loss": 0.4395,
"step": 828
},
{
"epoch": 0.8929982046678635,
"grad_norm": 0.40308547019958496,
"learning_rate": 8.856959311329495e-06,
"loss": 0.4254,
"step": 829
},
{
"epoch": 0.8940754039497307,
"grad_norm": 0.38909968733787537,
"learning_rate": 8.852965889473464e-06,
"loss": 0.4308,
"step": 830
},
{
"epoch": 0.8951526032315978,
"grad_norm": 0.4318833649158478,
"learning_rate": 8.848966407522992e-06,
"loss": 0.4354,
"step": 831
},
{
"epoch": 0.896229802513465,
"grad_norm": 0.3676931858062744,
"learning_rate": 8.844960871768618e-06,
"loss": 0.45,
"step": 832
},
{
"epoch": 0.8973070017953322,
"grad_norm": 0.39167362451553345,
"learning_rate": 8.84094928851041e-06,
"loss": 0.4468,
"step": 833
},
{
"epoch": 0.8983842010771993,
"grad_norm": 0.3510020673274994,
"learning_rate": 8.836931664057935e-06,
"loss": 0.4294,
"step": 834
},
{
"epoch": 0.8994614003590664,
"grad_norm": 0.3821624517440796,
"learning_rate": 8.832908004730274e-06,
"loss": 0.4422,
"step": 835
},
{
"epoch": 0.9005385996409335,
"grad_norm": 0.38731488585472107,
"learning_rate": 8.828878316855994e-06,
"loss": 0.4066,
"step": 836
},
{
"epoch": 0.9016157989228007,
"grad_norm": 0.34374868869781494,
"learning_rate": 8.824842606773142e-06,
"loss": 0.4421,
"step": 837
},
{
"epoch": 0.9026929982046679,
"grad_norm": 0.41355276107788086,
"learning_rate": 8.82080088082924e-06,
"loss": 0.421,
"step": 838
},
{
"epoch": 0.903770197486535,
"grad_norm": 0.3896749019622803,
"learning_rate": 8.816753145381276e-06,
"loss": 0.4304,
"step": 839
},
{
"epoch": 0.9048473967684022,
"grad_norm": 0.39807990193367004,
"learning_rate": 8.812699406795683e-06,
"loss": 0.4462,
"step": 840
},
{
"epoch": 0.9059245960502693,
"grad_norm": 0.34178224205970764,
"learning_rate": 8.808639671448334e-06,
"loss": 0.4185,
"step": 841
},
{
"epoch": 0.9070017953321364,
"grad_norm": 0.3645724058151245,
"learning_rate": 8.804573945724544e-06,
"loss": 0.4167,
"step": 842
},
{
"epoch": 0.9080789946140035,
"grad_norm": 0.35154834389686584,
"learning_rate": 8.800502236019045e-06,
"loss": 0.4142,
"step": 843
},
{
"epoch": 0.9091561938958708,
"grad_norm": 0.43520885705947876,
"learning_rate": 8.796424548735975e-06,
"loss": 0.4486,
"step": 844
},
{
"epoch": 0.9102333931777379,
"grad_norm": 0.38887906074523926,
"learning_rate": 8.792340890288884e-06,
"loss": 0.4419,
"step": 845
},
{
"epoch": 0.911310592459605,
"grad_norm": 0.3473518192768097,
"learning_rate": 8.788251267100704e-06,
"loss": 0.4151,
"step": 846
},
{
"epoch": 0.9123877917414722,
"grad_norm": 0.4001302421092987,
"learning_rate": 8.78415568560376e-06,
"loss": 0.4186,
"step": 847
},
{
"epoch": 0.9134649910233393,
"grad_norm": 0.4523400664329529,
"learning_rate": 8.780054152239734e-06,
"loss": 0.4695,
"step": 848
},
{
"epoch": 0.9145421903052064,
"grad_norm": 0.36641725897789,
"learning_rate": 8.775946673459682e-06,
"loss": 0.4095,
"step": 849
},
{
"epoch": 0.9156193895870736,
"grad_norm": 0.42685094475746155,
"learning_rate": 8.771833255724004e-06,
"loss": 0.4286,
"step": 850
},
{
"epoch": 0.9166965888689408,
"grad_norm": 0.46389880776405334,
"learning_rate": 8.767713905502444e-06,
"loss": 0.4668,
"step": 851
},
{
"epoch": 0.9177737881508079,
"grad_norm": 0.3745497465133667,
"learning_rate": 8.763588629274077e-06,
"loss": 0.4483,
"step": 852
},
{
"epoch": 0.918850987432675,
"grad_norm": 0.4221557378768921,
"learning_rate": 8.759457433527296e-06,
"loss": 0.4492,
"step": 853
},
{
"epoch": 0.9199281867145422,
"grad_norm": 0.4107033908367157,
"learning_rate": 8.755320324759808e-06,
"loss": 0.4216,
"step": 854
},
{
"epoch": 0.9210053859964094,
"grad_norm": 0.37441039085388184,
"learning_rate": 8.751177309478618e-06,
"loss": 0.4021,
"step": 855
},
{
"epoch": 0.9220825852782765,
"grad_norm": 0.4322914481163025,
"learning_rate": 8.747028394200019e-06,
"loss": 0.4658,
"step": 856
},
{
"epoch": 0.9231597845601436,
"grad_norm": 0.4002901315689087,
"learning_rate": 8.74287358544959e-06,
"loss": 0.4388,
"step": 857
},
{
"epoch": 0.9242369838420108,
"grad_norm": 0.4236518442630768,
"learning_rate": 8.73871288976217e-06,
"loss": 0.4258,
"step": 858
},
{
"epoch": 0.9253141831238779,
"grad_norm": 0.41266000270843506,
"learning_rate": 8.734546313681869e-06,
"loss": 0.4204,
"step": 859
},
{
"epoch": 0.926391382405745,
"grad_norm": 0.4816506505012512,
"learning_rate": 8.730373863762036e-06,
"loss": 0.405,
"step": 860
},
{
"epoch": 0.9274685816876123,
"grad_norm": 0.36234742403030396,
"learning_rate": 8.726195546565264e-06,
"loss": 0.4455,
"step": 861
},
{
"epoch": 0.9285457809694794,
"grad_norm": 0.4364088177680969,
"learning_rate": 8.722011368663373e-06,
"loss": 0.4361,
"step": 862
},
{
"epoch": 0.9296229802513465,
"grad_norm": 0.46902528405189514,
"learning_rate": 8.717821336637397e-06,
"loss": 0.4379,
"step": 863
},
{
"epoch": 0.9307001795332136,
"grad_norm": 0.3549644649028778,
"learning_rate": 8.713625457077585e-06,
"loss": 0.4384,
"step": 864
},
{
"epoch": 0.9317773788150808,
"grad_norm": 0.4156612157821655,
"learning_rate": 8.70942373658338e-06,
"loss": 0.4329,
"step": 865
},
{
"epoch": 0.9328545780969479,
"grad_norm": 0.4271438717842102,
"learning_rate": 8.705216181763407e-06,
"loss": 0.4279,
"step": 866
},
{
"epoch": 0.933931777378815,
"grad_norm": 0.3731890022754669,
"learning_rate": 8.701002799235475e-06,
"loss": 0.4318,
"step": 867
},
{
"epoch": 0.9350089766606823,
"grad_norm": 0.4255085587501526,
"learning_rate": 8.696783595626555e-06,
"loss": 0.4375,
"step": 868
},
{
"epoch": 0.9360861759425494,
"grad_norm": 0.36597201228141785,
"learning_rate": 8.692558577572773e-06,
"loss": 0.4118,
"step": 869
},
{
"epoch": 0.9371633752244165,
"grad_norm": 0.42821747064590454,
"learning_rate": 8.688327751719403e-06,
"loss": 0.4368,
"step": 870
},
{
"epoch": 0.9382405745062836,
"grad_norm": 0.47871869802474976,
"learning_rate": 8.684091124720852e-06,
"loss": 0.4417,
"step": 871
},
{
"epoch": 0.9393177737881508,
"grad_norm": 0.3639247715473175,
"learning_rate": 8.679848703240652e-06,
"loss": 0.4333,
"step": 872
},
{
"epoch": 0.940394973070018,
"grad_norm": 0.45801442861557007,
"learning_rate": 8.675600493951448e-06,
"loss": 0.4501,
"step": 873
},
{
"epoch": 0.9414721723518851,
"grad_norm": 0.39988264441490173,
"learning_rate": 8.671346503534987e-06,
"loss": 0.4096,
"step": 874
},
{
"epoch": 0.9425493716337523,
"grad_norm": 0.4402971565723419,
"learning_rate": 8.667086738682114e-06,
"loss": 0.4286,
"step": 875
},
{
"epoch": 0.9436265709156194,
"grad_norm": 0.42627960443496704,
"learning_rate": 8.662821206092749e-06,
"loss": 0.4597,
"step": 876
},
{
"epoch": 0.9447037701974865,
"grad_norm": 0.4394327402114868,
"learning_rate": 8.65854991247589e-06,
"loss": 0.4249,
"step": 877
},
{
"epoch": 0.9457809694793536,
"grad_norm": 0.40527400374412537,
"learning_rate": 8.654272864549592e-06,
"loss": 0.4246,
"step": 878
},
{
"epoch": 0.9468581687612209,
"grad_norm": 0.35895228385925293,
"learning_rate": 8.64999006904096e-06,
"loss": 0.4421,
"step": 879
},
{
"epoch": 0.947935368043088,
"grad_norm": 0.4643931984901428,
"learning_rate": 8.645701532686146e-06,
"loss": 0.4203,
"step": 880
},
{
"epoch": 0.9490125673249551,
"grad_norm": 0.40228623151779175,
"learning_rate": 8.641407262230325e-06,
"loss": 0.4416,
"step": 881
},
{
"epoch": 0.9500897666068223,
"grad_norm": 0.3212444484233856,
"learning_rate": 8.63710726442769e-06,
"loss": 0.4188,
"step": 882
},
{
"epoch": 0.9511669658886894,
"grad_norm": 0.41637882590293884,
"learning_rate": 8.632801546041447e-06,
"loss": 0.438,
"step": 883
},
{
"epoch": 0.9522441651705565,
"grad_norm": 0.3686106503009796,
"learning_rate": 8.628490113843798e-06,
"loss": 0.4165,
"step": 884
},
{
"epoch": 0.9533213644524237,
"grad_norm": 0.3435036540031433,
"learning_rate": 8.624172974615926e-06,
"loss": 0.4005,
"step": 885
},
{
"epoch": 0.9543985637342909,
"grad_norm": 0.40801119804382324,
"learning_rate": 8.619850135148002e-06,
"loss": 0.44,
"step": 886
},
{
"epoch": 0.955475763016158,
"grad_norm": 0.3914053738117218,
"learning_rate": 8.615521602239151e-06,
"loss": 0.43,
"step": 887
},
{
"epoch": 0.9565529622980251,
"grad_norm": 0.3459240794181824,
"learning_rate": 8.611187382697459e-06,
"loss": 0.4501,
"step": 888
},
{
"epoch": 0.9576301615798922,
"grad_norm": 0.4208565652370453,
"learning_rate": 8.606847483339957e-06,
"loss": 0.4423,
"step": 889
},
{
"epoch": 0.9587073608617595,
"grad_norm": 0.402261883020401,
"learning_rate": 8.602501910992604e-06,
"loss": 0.4305,
"step": 890
},
{
"epoch": 0.9597845601436266,
"grad_norm": 0.41275733709335327,
"learning_rate": 8.598150672490289e-06,
"loss": 0.4377,
"step": 891
},
{
"epoch": 0.9608617594254937,
"grad_norm": 0.4121462106704712,
"learning_rate": 8.593793774676804e-06,
"loss": 0.4474,
"step": 892
},
{
"epoch": 0.9619389587073609,
"grad_norm": 0.3706258237361908,
"learning_rate": 8.58943122440485e-06,
"loss": 0.4359,
"step": 893
},
{
"epoch": 0.963016157989228,
"grad_norm": 0.3740246891975403,
"learning_rate": 8.585063028536015e-06,
"loss": 0.4366,
"step": 894
},
{
"epoch": 0.9640933572710951,
"grad_norm": 0.37208443880081177,
"learning_rate": 8.58068919394077e-06,
"loss": 0.4444,
"step": 895
},
{
"epoch": 0.9651705565529622,
"grad_norm": 0.39575937390327454,
"learning_rate": 8.576309727498446e-06,
"loss": 0.4347,
"step": 896
},
{
"epoch": 0.9662477558348295,
"grad_norm": 0.4086984395980835,
"learning_rate": 8.571924636097245e-06,
"loss": 0.4563,
"step": 897
},
{
"epoch": 0.9673249551166966,
"grad_norm": 0.34942734241485596,
"learning_rate": 8.567533926634203e-06,
"loss": 0.4363,
"step": 898
},
{
"epoch": 0.9684021543985637,
"grad_norm": 0.38321834802627563,
"learning_rate": 8.563137606015201e-06,
"loss": 0.4313,
"step": 899
},
{
"epoch": 0.9694793536804309,
"grad_norm": 0.4460221529006958,
"learning_rate": 8.558735681154944e-06,
"loss": 0.4113,
"step": 900
},
{
"epoch": 0.970556552962298,
"grad_norm": 0.42695295810699463,
"learning_rate": 8.554328158976948e-06,
"loss": 0.423,
"step": 901
},
{
"epoch": 0.9716337522441651,
"grad_norm": 0.3403206467628479,
"learning_rate": 8.549915046413537e-06,
"loss": 0.4425,
"step": 902
},
{
"epoch": 0.9727109515260323,
"grad_norm": 0.34595513343811035,
"learning_rate": 8.545496350405825e-06,
"loss": 0.4319,
"step": 903
},
{
"epoch": 0.9737881508078995,
"grad_norm": 0.43634214997291565,
"learning_rate": 8.54107207790371e-06,
"loss": 0.4289,
"step": 904
},
{
"epoch": 0.9748653500897666,
"grad_norm": 0.3104211091995239,
"learning_rate": 8.536642235865857e-06,
"loss": 0.4062,
"step": 905
},
{
"epoch": 0.9759425493716337,
"grad_norm": 0.37977078557014465,
"learning_rate": 8.532206831259695e-06,
"loss": 0.416,
"step": 906
},
{
"epoch": 0.9770197486535009,
"grad_norm": 0.35528695583343506,
"learning_rate": 8.527765871061403e-06,
"loss": 0.4385,
"step": 907
},
{
"epoch": 0.978096947935368,
"grad_norm": 0.3993757665157318,
"learning_rate": 8.523319362255894e-06,
"loss": 0.4515,
"step": 908
},
{
"epoch": 0.9791741472172352,
"grad_norm": 0.36851876974105835,
"learning_rate": 8.518867311836808e-06,
"loss": 0.419,
"step": 909
},
{
"epoch": 0.9802513464991023,
"grad_norm": 0.3733499348163605,
"learning_rate": 8.514409726806506e-06,
"loss": 0.4422,
"step": 910
},
{
"epoch": 0.9813285457809695,
"grad_norm": 0.37746962904930115,
"learning_rate": 8.509946614176047e-06,
"loss": 0.4402,
"step": 911
},
{
"epoch": 0.9824057450628366,
"grad_norm": 0.3641514480113983,
"learning_rate": 8.505477980965191e-06,
"loss": 0.4472,
"step": 912
},
{
"epoch": 0.9834829443447037,
"grad_norm": 0.41282108426094055,
"learning_rate": 8.501003834202377e-06,
"loss": 0.4536,
"step": 913
},
{
"epoch": 0.984560143626571,
"grad_norm": 0.38131409883499146,
"learning_rate": 8.49652418092472e-06,
"loss": 0.437,
"step": 914
},
{
"epoch": 0.9856373429084381,
"grad_norm": 0.4030049741268158,
"learning_rate": 8.492039028177985e-06,
"loss": 0.4174,
"step": 915
},
{
"epoch": 0.9867145421903052,
"grad_norm": 0.4333740770816803,
"learning_rate": 8.487548383016602e-06,
"loss": 0.4381,
"step": 916
},
{
"epoch": 0.9877917414721723,
"grad_norm": 0.3468566834926605,
"learning_rate": 8.483052252503629e-06,
"loss": 0.4582,
"step": 917
},
{
"epoch": 0.9888689407540395,
"grad_norm": 0.36902710795402527,
"learning_rate": 8.478550643710754e-06,
"loss": 0.4361,
"step": 918
},
{
"epoch": 0.9899461400359066,
"grad_norm": 0.38729527592658997,
"learning_rate": 8.474043563718287e-06,
"loss": 0.4302,
"step": 919
},
{
"epoch": 0.9910233393177738,
"grad_norm": 0.38706785440444946,
"learning_rate": 8.469531019615132e-06,
"loss": 0.4726,
"step": 920
},
{
"epoch": 0.992100538599641,
"grad_norm": 0.3167583644390106,
"learning_rate": 8.465013018498796e-06,
"loss": 0.4359,
"step": 921
},
{
"epoch": 0.9931777378815081,
"grad_norm": 0.360503226518631,
"learning_rate": 8.460489567475367e-06,
"loss": 0.4346,
"step": 922
},
{
"epoch": 0.9942549371633752,
"grad_norm": 0.42758846282958984,
"learning_rate": 8.455960673659507e-06,
"loss": 0.4714,
"step": 923
},
{
"epoch": 0.9953321364452423,
"grad_norm": 0.3499755859375,
"learning_rate": 8.451426344174433e-06,
"loss": 0.4286,
"step": 924
},
{
"epoch": 0.9964093357271095,
"grad_norm": 0.40906208753585815,
"learning_rate": 8.446886586151914e-06,
"loss": 0.4267,
"step": 925
},
{
"epoch": 0.9974865350089767,
"grad_norm": 0.36410462856292725,
"learning_rate": 8.442341406732261e-06,
"loss": 0.4222,
"step": 926
},
{
"epoch": 0.9985637342908438,
"grad_norm": 0.42686885595321655,
"learning_rate": 8.437790813064305e-06,
"loss": 0.4459,
"step": 927
},
{
"epoch": 0.999640933572711,
"grad_norm": 0.36179864406585693,
"learning_rate": 8.433234812305402e-06,
"loss": 0.4501,
"step": 928
},
{
"epoch": 1.000718132854578,
"grad_norm": 0.7051083445549011,
"learning_rate": 8.4286734116214e-06,
"loss": 0.6643,
"step": 929
},
{
"epoch": 1.0017953321364452,
"grad_norm": 0.39998215436935425,
"learning_rate": 8.424106618186653e-06,
"loss": 0.4096,
"step": 930
},
{
"epoch": 1.0028725314183125,
"grad_norm": 0.39742282032966614,
"learning_rate": 8.419534439183987e-06,
"loss": 0.4254,
"step": 931
},
{
"epoch": 1.0039497307001795,
"grad_norm": 0.37354665994644165,
"learning_rate": 8.414956881804706e-06,
"loss": 0.3737,
"step": 932
},
{
"epoch": 1.0050269299820467,
"grad_norm": 0.38051116466522217,
"learning_rate": 8.41037395324857e-06,
"loss": 0.3806,
"step": 933
},
{
"epoch": 1.006104129263914,
"grad_norm": 0.44809573888778687,
"learning_rate": 8.405785660723784e-06,
"loss": 0.448,
"step": 934
},
{
"epoch": 1.007181328545781,
"grad_norm": 0.42937591671943665,
"learning_rate": 8.401192011446995e-06,
"loss": 0.3847,
"step": 935
},
{
"epoch": 1.0082585278276481,
"grad_norm": 0.45236900448799133,
"learning_rate": 8.396593012643272e-06,
"loss": 0.4186,
"step": 936
},
{
"epoch": 1.0093357271095154,
"grad_norm": 0.4174693822860718,
"learning_rate": 8.391988671546099e-06,
"loss": 0.3773,
"step": 937
},
{
"epoch": 1.0104129263913824,
"grad_norm": 0.446155309677124,
"learning_rate": 8.387378995397363e-06,
"loss": 0.4318,
"step": 938
},
{
"epoch": 1.0114901256732496,
"grad_norm": 0.3945539593696594,
"learning_rate": 8.382763991447344e-06,
"loss": 0.4022,
"step": 939
},
{
"epoch": 1.0125673249551166,
"grad_norm": 0.38236725330352783,
"learning_rate": 8.378143666954696e-06,
"loss": 0.3545,
"step": 940
},
{
"epoch": 1.0136445242369838,
"grad_norm": 0.49245333671569824,
"learning_rate": 8.373518029186448e-06,
"loss": 0.4435,
"step": 941
},
{
"epoch": 1.014721723518851,
"grad_norm": 0.4444209039211273,
"learning_rate": 8.368887085417979e-06,
"loss": 0.4233,
"step": 942
},
{
"epoch": 1.015798922800718,
"grad_norm": 0.38781559467315674,
"learning_rate": 8.364250842933019e-06,
"loss": 0.3919,
"step": 943
},
{
"epoch": 1.0168761220825853,
"grad_norm": 0.47153082489967346,
"learning_rate": 8.359609309023632e-06,
"loss": 0.4201,
"step": 944
},
{
"epoch": 1.0179533213644525,
"grad_norm": 0.39679715037345886,
"learning_rate": 8.354962490990202e-06,
"loss": 0.3837,
"step": 945
},
{
"epoch": 1.0190305206463195,
"grad_norm": 0.440632164478302,
"learning_rate": 8.350310396141424e-06,
"loss": 0.4389,
"step": 946
},
{
"epoch": 1.0201077199281867,
"grad_norm": 0.3632691204547882,
"learning_rate": 8.345653031794292e-06,
"loss": 0.3517,
"step": 947
},
{
"epoch": 1.021184919210054,
"grad_norm": 0.49212685227394104,
"learning_rate": 8.340990405274092e-06,
"loss": 0.4624,
"step": 948
},
{
"epoch": 1.022262118491921,
"grad_norm": 0.3657776713371277,
"learning_rate": 8.336322523914385e-06,
"loss": 0.4006,
"step": 949
},
{
"epoch": 1.0233393177737882,
"grad_norm": 0.47523999214172363,
"learning_rate": 8.331649395056996e-06,
"loss": 0.434,
"step": 950
},
{
"epoch": 1.0244165170556554,
"grad_norm": 0.3606716990470886,
"learning_rate": 8.326971026052e-06,
"loss": 0.3835,
"step": 951
},
{
"epoch": 1.0254937163375224,
"grad_norm": 0.4083137810230255,
"learning_rate": 8.32228742425772e-06,
"loss": 0.4629,
"step": 952
},
{
"epoch": 1.0265709156193896,
"grad_norm": 0.35142168402671814,
"learning_rate": 8.317598597040706e-06,
"loss": 0.3599,
"step": 953
},
{
"epoch": 1.0276481149012566,
"grad_norm": 0.4037002623081207,
"learning_rate": 8.312904551775731e-06,
"loss": 0.4073,
"step": 954
},
{
"epoch": 1.0287253141831239,
"grad_norm": 0.39003700017929077,
"learning_rate": 8.308205295845769e-06,
"loss": 0.4228,
"step": 955
},
{
"epoch": 1.029802513464991,
"grad_norm": 0.41409826278686523,
"learning_rate": 8.303500836641992e-06,
"loss": 0.4328,
"step": 956
},
{
"epoch": 1.030879712746858,
"grad_norm": 0.3729735314846039,
"learning_rate": 8.298791181563755e-06,
"loss": 0.4083,
"step": 957
},
{
"epoch": 1.0319569120287253,
"grad_norm": 0.39012017846107483,
"learning_rate": 8.29407633801859e-06,
"loss": 0.383,
"step": 958
},
{
"epoch": 1.0330341113105925,
"grad_norm": 0.4182110130786896,
"learning_rate": 8.289356313422182e-06,
"loss": 0.403,
"step": 959
},
{
"epoch": 1.0341113105924595,
"grad_norm": 0.4065341651439667,
"learning_rate": 8.284631115198371e-06,
"loss": 0.4432,
"step": 960
},
{
"epoch": 1.0351885098743268,
"grad_norm": 0.40570753812789917,
"learning_rate": 8.279900750779137e-06,
"loss": 0.3694,
"step": 961
},
{
"epoch": 1.036265709156194,
"grad_norm": 0.42988771200180054,
"learning_rate": 8.275165227604574e-06,
"loss": 0.4224,
"step": 962
},
{
"epoch": 1.037342908438061,
"grad_norm": 0.35634714365005493,
"learning_rate": 8.2704245531229e-06,
"loss": 0.3693,
"step": 963
},
{
"epoch": 1.0384201077199282,
"grad_norm": 0.4628054201602936,
"learning_rate": 8.26567873479043e-06,
"loss": 0.4245,
"step": 964
},
{
"epoch": 1.0394973070017954,
"grad_norm": 0.39119595289230347,
"learning_rate": 8.260927780071572e-06,
"loss": 0.3917,
"step": 965
},
{
"epoch": 1.0405745062836624,
"grad_norm": 0.39787107706069946,
"learning_rate": 8.256171696438817e-06,
"loss": 0.4077,
"step": 966
},
{
"epoch": 1.0416517055655297,
"grad_norm": 0.38707029819488525,
"learning_rate": 8.251410491372711e-06,
"loss": 0.39,
"step": 967
},
{
"epoch": 1.0427289048473967,
"grad_norm": 0.42183634638786316,
"learning_rate": 8.246644172361866e-06,
"loss": 0.4129,
"step": 968
},
{
"epoch": 1.043806104129264,
"grad_norm": 0.39003661274909973,
"learning_rate": 8.241872746902934e-06,
"loss": 0.3921,
"step": 969
},
{
"epoch": 1.0448833034111311,
"grad_norm": 0.3627468943595886,
"learning_rate": 8.237096222500597e-06,
"loss": 0.418,
"step": 970
},
{
"epoch": 1.0459605026929981,
"grad_norm": 0.40649735927581787,
"learning_rate": 8.232314606667559e-06,
"loss": 0.4213,
"step": 971
},
{
"epoch": 1.0470377019748653,
"grad_norm": 0.44418761134147644,
"learning_rate": 8.22752790692453e-06,
"loss": 0.4184,
"step": 972
},
{
"epoch": 1.0481149012567326,
"grad_norm": 0.3849025070667267,
"learning_rate": 8.222736130800219e-06,
"loss": 0.42,
"step": 973
},
{
"epoch": 1.0491921005385996,
"grad_norm": 0.36222249269485474,
"learning_rate": 8.217939285831315e-06,
"loss": 0.3772,
"step": 974
},
{
"epoch": 1.0502692998204668,
"grad_norm": 0.4193369448184967,
"learning_rate": 8.213137379562486e-06,
"loss": 0.4088,
"step": 975
},
{
"epoch": 1.051346499102334,
"grad_norm": 0.3462792634963989,
"learning_rate": 8.208330419546353e-06,
"loss": 0.3855,
"step": 976
},
{
"epoch": 1.052423698384201,
"grad_norm": 0.4043339788913727,
"learning_rate": 8.203518413343492e-06,
"loss": 0.4117,
"step": 977
},
{
"epoch": 1.0535008976660682,
"grad_norm": 0.36096611618995667,
"learning_rate": 8.198701368522413e-06,
"loss": 0.4169,
"step": 978
},
{
"epoch": 1.0545780969479355,
"grad_norm": 0.34872111678123474,
"learning_rate": 8.19387929265955e-06,
"loss": 0.4248,
"step": 979
},
{
"epoch": 1.0556552962298025,
"grad_norm": 0.34945443272590637,
"learning_rate": 8.189052193339251e-06,
"loss": 0.4002,
"step": 980
},
{
"epoch": 1.0567324955116697,
"grad_norm": 0.3416961133480072,
"learning_rate": 8.184220078153768e-06,
"loss": 0.3687,
"step": 981
},
{
"epoch": 1.0578096947935367,
"grad_norm": 0.3593612611293793,
"learning_rate": 8.179382954703236e-06,
"loss": 0.4435,
"step": 982
},
{
"epoch": 1.058886894075404,
"grad_norm": 0.40099433064460754,
"learning_rate": 8.174540830595674e-06,
"loss": 0.4039,
"step": 983
},
{
"epoch": 1.0599640933572712,
"grad_norm": 0.34969210624694824,
"learning_rate": 8.16969371344696e-06,
"loss": 0.4188,
"step": 984
},
{
"epoch": 1.0610412926391382,
"grad_norm": 0.40064018964767456,
"learning_rate": 8.16484161088083e-06,
"loss": 0.427,
"step": 985
},
{
"epoch": 1.0621184919210054,
"grad_norm": 0.3965623080730438,
"learning_rate": 8.159984530528859e-06,
"loss": 0.3961,
"step": 986
},
{
"epoch": 1.0631956912028726,
"grad_norm": 0.37393060326576233,
"learning_rate": 8.155122480030454e-06,
"loss": 0.4067,
"step": 987
},
{
"epoch": 1.0642728904847396,
"grad_norm": 0.39971593022346497,
"learning_rate": 8.150255467032831e-06,
"loss": 0.4462,
"step": 988
},
{
"epoch": 1.0653500897666068,
"grad_norm": 0.3918885588645935,
"learning_rate": 8.14538349919102e-06,
"loss": 0.4158,
"step": 989
},
{
"epoch": 1.066427289048474,
"grad_norm": 0.39894959330558777,
"learning_rate": 8.140506584167845e-06,
"loss": 0.4069,
"step": 990
},
{
"epoch": 1.067504488330341,
"grad_norm": 0.34239354729652405,
"learning_rate": 8.135624729633902e-06,
"loss": 0.4134,
"step": 991
},
{
"epoch": 1.0685816876122083,
"grad_norm": 0.4456005394458771,
"learning_rate": 8.130737943267563e-06,
"loss": 0.4018,
"step": 992
},
{
"epoch": 1.0696588868940755,
"grad_norm": 0.3645940124988556,
"learning_rate": 8.12584623275496e-06,
"loss": 0.4127,
"step": 993
},
{
"epoch": 1.0707360861759425,
"grad_norm": 0.5911320447921753,
"learning_rate": 8.12094960578996e-06,
"loss": 0.3952,
"step": 994
},
{
"epoch": 1.0718132854578097,
"grad_norm": 0.3891417384147644,
"learning_rate": 8.11604807007417e-06,
"loss": 0.4177,
"step": 995
},
{
"epoch": 1.0728904847396767,
"grad_norm": 0.3503113090991974,
"learning_rate": 8.111141633316914e-06,
"loss": 0.3882,
"step": 996
},
{
"epoch": 1.073967684021544,
"grad_norm": 0.3674125373363495,
"learning_rate": 8.10623030323523e-06,
"loss": 0.4164,
"step": 997
},
{
"epoch": 1.0750448833034112,
"grad_norm": 0.36334046721458435,
"learning_rate": 8.101314087553845e-06,
"loss": 0.3952,
"step": 998
},
{
"epoch": 1.0761220825852782,
"grad_norm": 0.35265591740608215,
"learning_rate": 8.096392994005177e-06,
"loss": 0.4227,
"step": 999
},
{
"epoch": 1.0771992818671454,
"grad_norm": 0.3510865569114685,
"learning_rate": 8.091467030329309e-06,
"loss": 0.3634,
"step": 1000
},
{
"epoch": 1.0782764811490126,
"grad_norm": 0.38359448313713074,
"learning_rate": 8.086536204273994e-06,
"loss": 0.4036,
"step": 1001
},
{
"epoch": 1.0793536804308796,
"grad_norm": 0.3547118008136749,
"learning_rate": 8.081600523594622e-06,
"loss": 0.4408,
"step": 1002
},
{
"epoch": 1.0804308797127469,
"grad_norm": 0.3304808437824249,
"learning_rate": 8.076659996054226e-06,
"loss": 0.3613,
"step": 1003
},
{
"epoch": 1.081508078994614,
"grad_norm": 0.34392449259757996,
"learning_rate": 8.071714629423459e-06,
"loss": 0.4102,
"step": 1004
},
{
"epoch": 1.082585278276481,
"grad_norm": 0.3510746657848358,
"learning_rate": 8.066764431480584e-06,
"loss": 0.4384,
"step": 1005
},
{
"epoch": 1.0836624775583483,
"grad_norm": 0.35708552598953247,
"learning_rate": 8.061809410011466e-06,
"loss": 0.4413,
"step": 1006
},
{
"epoch": 1.0847396768402153,
"grad_norm": 0.35590386390686035,
"learning_rate": 8.056849572809555e-06,
"loss": 0.4162,
"step": 1007
},
{
"epoch": 1.0858168761220826,
"grad_norm": 0.3295712172985077,
"learning_rate": 8.051884927675879e-06,
"loss": 0.4208,
"step": 1008
},
{
"epoch": 1.0868940754039498,
"grad_norm": 0.3516232967376709,
"learning_rate": 8.046915482419018e-06,
"loss": 0.3652,
"step": 1009
},
{
"epoch": 1.0879712746858168,
"grad_norm": 0.3285958170890808,
"learning_rate": 8.041941244855113e-06,
"loss": 0.3883,
"step": 1010
},
{
"epoch": 1.089048473967684,
"grad_norm": 0.35579437017440796,
"learning_rate": 8.036962222807838e-06,
"loss": 0.4285,
"step": 1011
},
{
"epoch": 1.0901256732495512,
"grad_norm": 0.4027876853942871,
"learning_rate": 8.031978424108392e-06,
"loss": 0.4357,
"step": 1012
},
{
"epoch": 1.0912028725314182,
"grad_norm": 0.3562896251678467,
"learning_rate": 8.026989856595486e-06,
"loss": 0.437,
"step": 1013
},
{
"epoch": 1.0922800718132855,
"grad_norm": 0.373172402381897,
"learning_rate": 8.021996528115335e-06,
"loss": 0.3692,
"step": 1014
},
{
"epoch": 1.0933572710951527,
"grad_norm": 0.3635936379432678,
"learning_rate": 8.016998446521637e-06,
"loss": 0.4235,
"step": 1015
},
{
"epoch": 1.0944344703770197,
"grad_norm": 0.3417677581310272,
"learning_rate": 8.011995619675572e-06,
"loss": 0.4168,
"step": 1016
},
{
"epoch": 1.095511669658887,
"grad_norm": 0.46394291520118713,
"learning_rate": 8.00698805544578e-06,
"loss": 0.4154,
"step": 1017
},
{
"epoch": 1.0965888689407541,
"grad_norm": 0.3501376807689667,
"learning_rate": 8.001975761708348e-06,
"loss": 0.4171,
"step": 1018
},
{
"epoch": 1.0976660682226211,
"grad_norm": 0.38096755743026733,
"learning_rate": 7.996958746346812e-06,
"loss": 0.4483,
"step": 1019
},
{
"epoch": 1.0987432675044884,
"grad_norm": 0.37238049507141113,
"learning_rate": 7.991937017252127e-06,
"loss": 0.4116,
"step": 1020
},
{
"epoch": 1.0998204667863556,
"grad_norm": 0.36288678646087646,
"learning_rate": 7.986910582322663e-06,
"loss": 0.39,
"step": 1021
},
{
"epoch": 1.1008976660682226,
"grad_norm": 0.3652805685997009,
"learning_rate": 7.981879449464191e-06,
"loss": 0.3878,
"step": 1022
},
{
"epoch": 1.1019748653500898,
"grad_norm": 0.40952664613723755,
"learning_rate": 7.976843626589876e-06,
"loss": 0.4132,
"step": 1023
},
{
"epoch": 1.1030520646319568,
"grad_norm": 0.3170050382614136,
"learning_rate": 7.971803121620252e-06,
"loss": 0.3741,
"step": 1024
},
{
"epoch": 1.104129263913824,
"grad_norm": 0.4450155794620514,
"learning_rate": 7.966757942483224e-06,
"loss": 0.4434,
"step": 1025
},
{
"epoch": 1.1052064631956913,
"grad_norm": 0.4054987132549286,
"learning_rate": 7.96170809711404e-06,
"loss": 0.3978,
"step": 1026
},
{
"epoch": 1.1062836624775583,
"grad_norm": 0.4028407633304596,
"learning_rate": 7.9566535934553e-06,
"loss": 0.3979,
"step": 1027
},
{
"epoch": 1.1073608617594255,
"grad_norm": 0.3763718605041504,
"learning_rate": 7.951594439456921e-06,
"loss": 0.4084,
"step": 1028
},
{
"epoch": 1.1084380610412927,
"grad_norm": 0.398004412651062,
"learning_rate": 7.946530643076138e-06,
"loss": 0.4045,
"step": 1029
},
{
"epoch": 1.1095152603231597,
"grad_norm": 0.3450349271297455,
"learning_rate": 7.941462212277484e-06,
"loss": 0.3927,
"step": 1030
},
{
"epoch": 1.110592459605027,
"grad_norm": 0.3936854302883148,
"learning_rate": 7.936389155032785e-06,
"loss": 0.4304,
"step": 1031
},
{
"epoch": 1.1116696588868942,
"grad_norm": 0.3582758605480194,
"learning_rate": 7.931311479321144e-06,
"loss": 0.3867,
"step": 1032
},
{
"epoch": 1.1127468581687612,
"grad_norm": 0.38478732109069824,
"learning_rate": 7.926229193128924e-06,
"loss": 0.4265,
"step": 1033
},
{
"epoch": 1.1138240574506284,
"grad_norm": 0.32597222924232483,
"learning_rate": 7.921142304449744e-06,
"loss": 0.3484,
"step": 1034
},
{
"epoch": 1.1149012567324954,
"grad_norm": 0.3714587092399597,
"learning_rate": 7.916050821284462e-06,
"loss": 0.43,
"step": 1035
},
{
"epoch": 1.1159784560143626,
"grad_norm": 0.31232473254203796,
"learning_rate": 7.910954751641157e-06,
"loss": 0.4043,
"step": 1036
},
{
"epoch": 1.1170556552962299,
"grad_norm": 0.32094812393188477,
"learning_rate": 7.905854103535128e-06,
"loss": 0.3947,
"step": 1037
},
{
"epoch": 1.1181328545780969,
"grad_norm": 0.3706560730934143,
"learning_rate": 7.90074888498887e-06,
"loss": 0.4713,
"step": 1038
},
{
"epoch": 1.119210053859964,
"grad_norm": 0.3783697187900543,
"learning_rate": 7.895639104032071e-06,
"loss": 0.3714,
"step": 1039
},
{
"epoch": 1.1202872531418313,
"grad_norm": 0.3523646295070648,
"learning_rate": 7.890524768701592e-06,
"loss": 0.428,
"step": 1040
},
{
"epoch": 1.1213644524236983,
"grad_norm": 0.37586677074432373,
"learning_rate": 7.88540588704146e-06,
"loss": 0.4014,
"step": 1041
},
{
"epoch": 1.1224416517055655,
"grad_norm": 0.36774709820747375,
"learning_rate": 7.880282467102847e-06,
"loss": 0.4297,
"step": 1042
},
{
"epoch": 1.1235188509874328,
"grad_norm": 0.3239872455596924,
"learning_rate": 7.87515451694407e-06,
"loss": 0.3849,
"step": 1043
},
{
"epoch": 1.1245960502692998,
"grad_norm": 0.41246679425239563,
"learning_rate": 7.870022044630569e-06,
"loss": 0.4338,
"step": 1044
},
{
"epoch": 1.125673249551167,
"grad_norm": 0.2990739941596985,
"learning_rate": 7.864885058234895e-06,
"loss": 0.3491,
"step": 1045
},
{
"epoch": 1.1267504488330342,
"grad_norm": 0.3962797224521637,
"learning_rate": 7.859743565836697e-06,
"loss": 0.4388,
"step": 1046
},
{
"epoch": 1.1278276481149012,
"grad_norm": 0.3545224964618683,
"learning_rate": 7.854597575522717e-06,
"loss": 0.4245,
"step": 1047
},
{
"epoch": 1.1289048473967684,
"grad_norm": 0.3659273087978363,
"learning_rate": 7.849447095386769e-06,
"loss": 0.3882,
"step": 1048
},
{
"epoch": 1.1299820466786357,
"grad_norm": 0.32875725626945496,
"learning_rate": 7.844292133529727e-06,
"loss": 0.4093,
"step": 1049
},
{
"epoch": 1.1310592459605027,
"grad_norm": 0.3866625130176544,
"learning_rate": 7.839132698059515e-06,
"loss": 0.4166,
"step": 1050
},
{
"epoch": 1.13213644524237,
"grad_norm": 0.38909971714019775,
"learning_rate": 7.833968797091094e-06,
"loss": 0.3943,
"step": 1051
},
{
"epoch": 1.133213644524237,
"grad_norm": 0.37287622690200806,
"learning_rate": 7.828800438746448e-06,
"loss": 0.4399,
"step": 1052
},
{
"epoch": 1.1342908438061041,
"grad_norm": 0.32692670822143555,
"learning_rate": 7.82362763115457e-06,
"loss": 0.3592,
"step": 1053
},
{
"epoch": 1.1353680430879713,
"grad_norm": 0.42668434977531433,
"learning_rate": 7.818450382451457e-06,
"loss": 0.4426,
"step": 1054
},
{
"epoch": 1.1364452423698383,
"grad_norm": 0.3298830986022949,
"learning_rate": 7.813268700780084e-06,
"loss": 0.3771,
"step": 1055
},
{
"epoch": 1.1375224416517056,
"grad_norm": 0.3884989321231842,
"learning_rate": 7.808082594290403e-06,
"loss": 0.43,
"step": 1056
},
{
"epoch": 1.1385996409335728,
"grad_norm": 0.3532108962535858,
"learning_rate": 7.80289207113932e-06,
"loss": 0.4357,
"step": 1057
},
{
"epoch": 1.1396768402154398,
"grad_norm": 0.3356606662273407,
"learning_rate": 7.797697139490694e-06,
"loss": 0.3986,
"step": 1058
},
{
"epoch": 1.140754039497307,
"grad_norm": 0.29802894592285156,
"learning_rate": 7.792497807515317e-06,
"loss": 0.3898,
"step": 1059
},
{
"epoch": 1.141831238779174,
"grad_norm": 0.3978642225265503,
"learning_rate": 7.787294083390898e-06,
"loss": 0.4507,
"step": 1060
},
{
"epoch": 1.1429084380610413,
"grad_norm": 0.34184718132019043,
"learning_rate": 7.782085975302055e-06,
"loss": 0.3965,
"step": 1061
},
{
"epoch": 1.1439856373429085,
"grad_norm": 0.3312958776950836,
"learning_rate": 7.776873491440308e-06,
"loss": 0.3956,
"step": 1062
},
{
"epoch": 1.1450628366247755,
"grad_norm": 0.36556777358055115,
"learning_rate": 7.77165664000405e-06,
"loss": 0.4046,
"step": 1063
},
{
"epoch": 1.1461400359066427,
"grad_norm": 0.39094069600105286,
"learning_rate": 7.766435429198547e-06,
"loss": 0.4484,
"step": 1064
},
{
"epoch": 1.14721723518851,
"grad_norm": 0.4019407629966736,
"learning_rate": 7.761209867235924e-06,
"loss": 0.4012,
"step": 1065
},
{
"epoch": 1.148294434470377,
"grad_norm": 0.32983553409576416,
"learning_rate": 7.755979962335149e-06,
"loss": 0.3952,
"step": 1066
},
{
"epoch": 1.1493716337522442,
"grad_norm": 0.3956191837787628,
"learning_rate": 7.750745722722017e-06,
"loss": 0.4241,
"step": 1067
},
{
"epoch": 1.1504488330341114,
"grad_norm": 0.4105871915817261,
"learning_rate": 7.745507156629145e-06,
"loss": 0.4112,
"step": 1068
},
{
"epoch": 1.1515260323159784,
"grad_norm": 0.3463190197944641,
"learning_rate": 7.740264272295954e-06,
"loss": 0.3904,
"step": 1069
},
{
"epoch": 1.1526032315978456,
"grad_norm": 0.4070665240287781,
"learning_rate": 7.735017077968652e-06,
"loss": 0.3769,
"step": 1070
},
{
"epoch": 1.1536804308797128,
"grad_norm": 0.373894065618515,
"learning_rate": 7.729765581900236e-06,
"loss": 0.4304,
"step": 1071
},
{
"epoch": 1.1547576301615798,
"grad_norm": 0.3551969528198242,
"learning_rate": 7.72450979235046e-06,
"loss": 0.4078,
"step": 1072
},
{
"epoch": 1.155834829443447,
"grad_norm": 0.3633342683315277,
"learning_rate": 7.719249717585833e-06,
"loss": 0.4228,
"step": 1073
},
{
"epoch": 1.1569120287253143,
"grad_norm": 0.3957405388355255,
"learning_rate": 7.713985365879607e-06,
"loss": 0.3653,
"step": 1074
},
{
"epoch": 1.1579892280071813,
"grad_norm": 0.4325840175151825,
"learning_rate": 7.708716745511757e-06,
"loss": 0.4429,
"step": 1075
},
{
"epoch": 1.1590664272890485,
"grad_norm": 0.34411635994911194,
"learning_rate": 7.703443864768976e-06,
"loss": 0.4174,
"step": 1076
},
{
"epoch": 1.1601436265709157,
"grad_norm": 0.36224544048309326,
"learning_rate": 7.698166731944653e-06,
"loss": 0.4144,
"step": 1077
},
{
"epoch": 1.1612208258527827,
"grad_norm": 0.3776390850543976,
"learning_rate": 7.69288535533887e-06,
"loss": 0.3908,
"step": 1078
},
{
"epoch": 1.16229802513465,
"grad_norm": 0.3394273519515991,
"learning_rate": 7.68759974325838e-06,
"loss": 0.3893,
"step": 1079
},
{
"epoch": 1.163375224416517,
"grad_norm": 0.3884154260158539,
"learning_rate": 7.6823099040166e-06,
"loss": 0.4181,
"step": 1080
},
{
"epoch": 1.1644524236983842,
"grad_norm": 0.42072558403015137,
"learning_rate": 7.677015845933596e-06,
"loss": 0.4164,
"step": 1081
},
{
"epoch": 1.1655296229802514,
"grad_norm": 0.39499741792678833,
"learning_rate": 7.671717577336062e-06,
"loss": 0.4016,
"step": 1082
},
{
"epoch": 1.1666068222621184,
"grad_norm": 0.3707713782787323,
"learning_rate": 7.666415106557329e-06,
"loss": 0.3994,
"step": 1083
},
{
"epoch": 1.1676840215439857,
"grad_norm": 0.4331463873386383,
"learning_rate": 7.661108441937321e-06,
"loss": 0.3845,
"step": 1084
},
{
"epoch": 1.1687612208258529,
"grad_norm": 0.37316879630088806,
"learning_rate": 7.655797591822573e-06,
"loss": 0.428,
"step": 1085
},
{
"epoch": 1.1698384201077199,
"grad_norm": 0.38761723041534424,
"learning_rate": 7.650482564566192e-06,
"loss": 0.3879,
"step": 1086
},
{
"epoch": 1.170915619389587,
"grad_norm": 0.36918097734451294,
"learning_rate": 7.645163368527863e-06,
"loss": 0.4015,
"step": 1087
},
{
"epoch": 1.171992818671454,
"grad_norm": 0.4075348377227783,
"learning_rate": 7.63984001207382e-06,
"loss": 0.4451,
"step": 1088
},
{
"epoch": 1.1730700179533213,
"grad_norm": 0.35503262281417847,
"learning_rate": 7.63451250357685e-06,
"loss": 0.4096,
"step": 1089
},
{
"epoch": 1.1741472172351886,
"grad_norm": 0.3352997303009033,
"learning_rate": 7.62918085141626e-06,
"loss": 0.3787,
"step": 1090
},
{
"epoch": 1.1752244165170556,
"grad_norm": 0.4400590658187866,
"learning_rate": 7.623845063977883e-06,
"loss": 0.4086,
"step": 1091
},
{
"epoch": 1.1763016157989228,
"grad_norm": 0.40944555401802063,
"learning_rate": 7.618505149654052e-06,
"loss": 0.4123,
"step": 1092
},
{
"epoch": 1.17737881508079,
"grad_norm": 0.3341138958930969,
"learning_rate": 7.613161116843592e-06,
"loss": 0.3994,
"step": 1093
},
{
"epoch": 1.178456014362657,
"grad_norm": 0.40557679533958435,
"learning_rate": 7.607812973951802e-06,
"loss": 0.4149,
"step": 1094
},
{
"epoch": 1.1795332136445242,
"grad_norm": 0.36699768900871277,
"learning_rate": 7.602460729390455e-06,
"loss": 0.4145,
"step": 1095
},
{
"epoch": 1.1806104129263915,
"grad_norm": 0.3463127315044403,
"learning_rate": 7.597104391577765e-06,
"loss": 0.3929,
"step": 1096
},
{
"epoch": 1.1816876122082585,
"grad_norm": 0.3644564151763916,
"learning_rate": 7.59174396893839e-06,
"loss": 0.3951,
"step": 1097
},
{
"epoch": 1.1827648114901257,
"grad_norm": 0.3460002541542053,
"learning_rate": 7.586379469903409e-06,
"loss": 0.3661,
"step": 1098
},
{
"epoch": 1.183842010771993,
"grad_norm": 0.39767634868621826,
"learning_rate": 7.581010902910316e-06,
"loss": 0.441,
"step": 1099
},
{
"epoch": 1.18491921005386,
"grad_norm": 0.3830143213272095,
"learning_rate": 7.575638276403003e-06,
"loss": 0.4086,
"step": 1100
},
{
"epoch": 1.1859964093357271,
"grad_norm": 0.3573359549045563,
"learning_rate": 7.570261598831743e-06,
"loss": 0.3944,
"step": 1101
},
{
"epoch": 1.1870736086175944,
"grad_norm": 0.35708191990852356,
"learning_rate": 7.564880878653183e-06,
"loss": 0.3755,
"step": 1102
},
{
"epoch": 1.1881508078994614,
"grad_norm": 0.4171488583087921,
"learning_rate": 7.559496124330334e-06,
"loss": 0.4145,
"step": 1103
},
{
"epoch": 1.1892280071813286,
"grad_norm": 0.3435444235801697,
"learning_rate": 7.55410734433254e-06,
"loss": 0.4272,
"step": 1104
},
{
"epoch": 1.1903052064631956,
"grad_norm": 0.37534239888191223,
"learning_rate": 7.548714547135487e-06,
"loss": 0.392,
"step": 1105
},
{
"epoch": 1.1913824057450628,
"grad_norm": 0.4662499725818634,
"learning_rate": 7.5433177412211765e-06,
"loss": 0.4038,
"step": 1106
},
{
"epoch": 1.19245960502693,
"grad_norm": 0.3813924789428711,
"learning_rate": 7.537916935077914e-06,
"loss": 0.4284,
"step": 1107
},
{
"epoch": 1.193536804308797,
"grad_norm": 0.3959798812866211,
"learning_rate": 7.532512137200296e-06,
"loss": 0.4041,
"step": 1108
},
{
"epoch": 1.1946140035906643,
"grad_norm": 0.3660750389099121,
"learning_rate": 7.5271033560892e-06,
"loss": 0.3628,
"step": 1109
},
{
"epoch": 1.1956912028725315,
"grad_norm": 0.412552148103714,
"learning_rate": 7.521690600251765e-06,
"loss": 0.4299,
"step": 1110
},
{
"epoch": 1.1967684021543985,
"grad_norm": 0.37209901213645935,
"learning_rate": 7.516273878201387e-06,
"loss": 0.4278,
"step": 1111
},
{
"epoch": 1.1978456014362657,
"grad_norm": 0.3807721734046936,
"learning_rate": 7.5108531984576945e-06,
"loss": 0.4098,
"step": 1112
},
{
"epoch": 1.1989228007181327,
"grad_norm": 0.3844991624355316,
"learning_rate": 7.505428569546542e-06,
"loss": 0.3867,
"step": 1113
},
{
"epoch": 1.2,
"grad_norm": 0.3307826519012451,
"learning_rate": 7.500000000000001e-06,
"loss": 0.3747,
"step": 1114
},
{
"epoch": 1.2010771992818672,
"grad_norm": 0.37735849618911743,
"learning_rate": 7.494567498356332e-06,
"loss": 0.3997,
"step": 1115
},
{
"epoch": 1.2021543985637342,
"grad_norm": 0.4011041522026062,
"learning_rate": 7.489131073159988e-06,
"loss": 0.4328,
"step": 1116
},
{
"epoch": 1.2032315978456014,
"grad_norm": 0.39078488945961,
"learning_rate": 7.483690732961587e-06,
"loss": 0.4079,
"step": 1117
},
{
"epoch": 1.2043087971274686,
"grad_norm": 0.35534340143203735,
"learning_rate": 7.4782464863179085e-06,
"loss": 0.3727,
"step": 1118
},
{
"epoch": 1.2053859964093356,
"grad_norm": 0.3854304552078247,
"learning_rate": 7.472798341791877e-06,
"loss": 0.4314,
"step": 1119
},
{
"epoch": 1.2064631956912029,
"grad_norm": 0.4247446358203888,
"learning_rate": 7.467346307952544e-06,
"loss": 0.3992,
"step": 1120
},
{
"epoch": 1.20754039497307,
"grad_norm": 0.4052562415599823,
"learning_rate": 7.461890393375079e-06,
"loss": 0.3962,
"step": 1121
},
{
"epoch": 1.208617594254937,
"grad_norm": 0.38277021050453186,
"learning_rate": 7.456430606640757e-06,
"loss": 0.4331,
"step": 1122
},
{
"epoch": 1.2096947935368043,
"grad_norm": 0.4042188823223114,
"learning_rate": 7.450966956336946e-06,
"loss": 0.3912,
"step": 1123
},
{
"epoch": 1.2107719928186715,
"grad_norm": 0.3846050798892975,
"learning_rate": 7.445499451057083e-06,
"loss": 0.4148,
"step": 1124
},
{
"epoch": 1.2118491921005385,
"grad_norm": 0.3405422866344452,
"learning_rate": 7.4400280994006765e-06,
"loss": 0.4092,
"step": 1125
},
{
"epoch": 1.2129263913824058,
"grad_norm": 0.3947262763977051,
"learning_rate": 7.434552909973278e-06,
"loss": 0.4077,
"step": 1126
},
{
"epoch": 1.214003590664273,
"grad_norm": 0.4062022864818573,
"learning_rate": 7.429073891386479e-06,
"loss": 0.3901,
"step": 1127
},
{
"epoch": 1.21508078994614,
"grad_norm": 0.37780916690826416,
"learning_rate": 7.423591052257893e-06,
"loss": 0.4333,
"step": 1128
},
{
"epoch": 1.2161579892280072,
"grad_norm": 0.3507837951183319,
"learning_rate": 7.418104401211144e-06,
"loss": 0.4044,
"step": 1129
},
{
"epoch": 1.2172351885098744,
"grad_norm": 0.3761948049068451,
"learning_rate": 7.412613946875846e-06,
"loss": 0.4023,
"step": 1130
},
{
"epoch": 1.2183123877917414,
"grad_norm": 0.326219379901886,
"learning_rate": 7.407119697887603e-06,
"loss": 0.3822,
"step": 1131
},
{
"epoch": 1.2193895870736087,
"grad_norm": 0.42541274428367615,
"learning_rate": 7.4016216628879815e-06,
"loss": 0.4199,
"step": 1132
},
{
"epoch": 1.2204667863554757,
"grad_norm": 0.35826441645622253,
"learning_rate": 7.396119850524503e-06,
"loss": 0.4005,
"step": 1133
},
{
"epoch": 1.221543985637343,
"grad_norm": 0.2977360188961029,
"learning_rate": 7.390614269450633e-06,
"loss": 0.3382,
"step": 1134
},
{
"epoch": 1.2226211849192101,
"grad_norm": 0.39104917645454407,
"learning_rate": 7.385104928325766e-06,
"loss": 0.4011,
"step": 1135
},
{
"epoch": 1.2236983842010771,
"grad_norm": 0.3678983449935913,
"learning_rate": 7.379591835815204e-06,
"loss": 0.4436,
"step": 1136
},
{
"epoch": 1.2247755834829444,
"grad_norm": 0.3696475625038147,
"learning_rate": 7.374075000590155e-06,
"loss": 0.4438,
"step": 1137
},
{
"epoch": 1.2258527827648116,
"grad_norm": 0.34962400794029236,
"learning_rate": 7.36855443132771e-06,
"loss": 0.4115,
"step": 1138
},
{
"epoch": 1.2269299820466786,
"grad_norm": 0.37243911623954773,
"learning_rate": 7.363030136710837e-06,
"loss": 0.3826,
"step": 1139
},
{
"epoch": 1.2280071813285458,
"grad_norm": 0.34877726435661316,
"learning_rate": 7.357502125428359e-06,
"loss": 0.3973,
"step": 1140
},
{
"epoch": 1.2290843806104128,
"grad_norm": 0.37363243103027344,
"learning_rate": 7.351970406174951e-06,
"loss": 0.4348,
"step": 1141
},
{
"epoch": 1.23016157989228,
"grad_norm": 0.3667917251586914,
"learning_rate": 7.346434987651111e-06,
"loss": 0.3888,
"step": 1142
},
{
"epoch": 1.2312387791741473,
"grad_norm": 0.345958948135376,
"learning_rate": 7.3408958785631625e-06,
"loss": 0.3924,
"step": 1143
},
{
"epoch": 1.2323159784560143,
"grad_norm": 0.3950294852256775,
"learning_rate": 7.3353530876232315e-06,
"loss": 0.4324,
"step": 1144
},
{
"epoch": 1.2333931777378815,
"grad_norm": 0.35145139694213867,
"learning_rate": 7.329806623549236e-06,
"loss": 0.3679,
"step": 1145
},
{
"epoch": 1.2344703770197487,
"grad_norm": 0.3384093940258026,
"learning_rate": 7.324256495064866e-06,
"loss": 0.4611,
"step": 1146
},
{
"epoch": 1.2355475763016157,
"grad_norm": 0.3493618071079254,
"learning_rate": 7.318702710899584e-06,
"loss": 0.4004,
"step": 1147
},
{
"epoch": 1.236624775583483,
"grad_norm": 0.37390270829200745,
"learning_rate": 7.313145279788596e-06,
"loss": 0.4061,
"step": 1148
},
{
"epoch": 1.2377019748653502,
"grad_norm": 0.37359941005706787,
"learning_rate": 7.3075842104728445e-06,
"loss": 0.4111,
"step": 1149
},
{
"epoch": 1.2387791741472172,
"grad_norm": 0.3854544460773468,
"learning_rate": 7.302019511698998e-06,
"loss": 0.3844,
"step": 1150
},
{
"epoch": 1.2398563734290844,
"grad_norm": 0.4268665015697479,
"learning_rate": 7.296451192219427e-06,
"loss": 0.4398,
"step": 1151
},
{
"epoch": 1.2409335727109516,
"grad_norm": 0.3563729226589203,
"learning_rate": 7.290879260792203e-06,
"loss": 0.3937,
"step": 1152
},
{
"epoch": 1.2420107719928186,
"grad_norm": 0.4146507978439331,
"learning_rate": 7.285303726181077e-06,
"loss": 0.3851,
"step": 1153
},
{
"epoch": 1.2430879712746858,
"grad_norm": 0.3794476091861725,
"learning_rate": 7.279724597155463e-06,
"loss": 0.4188,
"step": 1154
},
{
"epoch": 1.244165170556553,
"grad_norm": 0.33285075426101685,
"learning_rate": 7.274141882490435e-06,
"loss": 0.4161,
"step": 1155
},
{
"epoch": 1.24524236983842,
"grad_norm": 0.3906962275505066,
"learning_rate": 7.2685555909667045e-06,
"loss": 0.3726,
"step": 1156
},
{
"epoch": 1.2463195691202873,
"grad_norm": 0.34079667925834656,
"learning_rate": 7.262965731370606e-06,
"loss": 0.4003,
"step": 1157
},
{
"epoch": 1.2473967684021545,
"grad_norm": 0.3254135251045227,
"learning_rate": 7.2573723124940876e-06,
"loss": 0.4046,
"step": 1158
},
{
"epoch": 1.2484739676840215,
"grad_norm": 0.36429592967033386,
"learning_rate": 7.251775343134695e-06,
"loss": 0.4357,
"step": 1159
},
{
"epoch": 1.2495511669658887,
"grad_norm": 0.36106106638908386,
"learning_rate": 7.246174832095562e-06,
"loss": 0.3967,
"step": 1160
},
{
"epoch": 1.2506283662477558,
"grad_norm": 0.3221151530742645,
"learning_rate": 7.2405707881853885e-06,
"loss": 0.3975,
"step": 1161
},
{
"epoch": 1.251705565529623,
"grad_norm": 0.3673859238624573,
"learning_rate": 7.23496322021843e-06,
"loss": 0.3945,
"step": 1162
},
{
"epoch": 1.2527827648114902,
"grad_norm": 0.3616444766521454,
"learning_rate": 7.22935213701449e-06,
"loss": 0.3969,
"step": 1163
},
{
"epoch": 1.2538599640933572,
"grad_norm": 0.3362361192703247,
"learning_rate": 7.223737547398898e-06,
"loss": 0.4019,
"step": 1164
},
{
"epoch": 1.2549371633752244,
"grad_norm": 0.34882697463035583,
"learning_rate": 7.218119460202499e-06,
"loss": 0.404,
"step": 1165
},
{
"epoch": 1.2560143626570914,
"grad_norm": 0.3654717206954956,
"learning_rate": 7.212497884261638e-06,
"loss": 0.4106,
"step": 1166
},
{
"epoch": 1.2570915619389587,
"grad_norm": 0.31606000661849976,
"learning_rate": 7.206872828418147e-06,
"loss": 0.3872,
"step": 1167
},
{
"epoch": 1.2581687612208259,
"grad_norm": 0.39388373494148254,
"learning_rate": 7.201244301519333e-06,
"loss": 0.4134,
"step": 1168
},
{
"epoch": 1.2592459605026929,
"grad_norm": 0.35163354873657227,
"learning_rate": 7.195612312417964e-06,
"loss": 0.3807,
"step": 1169
},
{
"epoch": 1.26032315978456,
"grad_norm": 0.37111741304397583,
"learning_rate": 7.189976869972249e-06,
"loss": 0.4145,
"step": 1170
},
{
"epoch": 1.2614003590664273,
"grad_norm": 0.32174402475357056,
"learning_rate": 7.184337983045831e-06,
"loss": 0.3836,
"step": 1171
},
{
"epoch": 1.2624775583482943,
"grad_norm": 0.36445340514183044,
"learning_rate": 7.17869566050777e-06,
"loss": 0.3976,
"step": 1172
},
{
"epoch": 1.2635547576301616,
"grad_norm": 0.33522090315818787,
"learning_rate": 7.1730499112325335e-06,
"loss": 0.3959,
"step": 1173
},
{
"epoch": 1.2646319569120288,
"grad_norm": 0.4076383709907532,
"learning_rate": 7.1674007440999706e-06,
"loss": 0.46,
"step": 1174
},
{
"epoch": 1.2657091561938958,
"grad_norm": 0.292697936296463,
"learning_rate": 7.161748167995312e-06,
"loss": 0.3617,
"step": 1175
},
{
"epoch": 1.266786355475763,
"grad_norm": 0.34674617648124695,
"learning_rate": 7.156092191809152e-06,
"loss": 0.4092,
"step": 1176
},
{
"epoch": 1.2678635547576302,
"grad_norm": 0.3220016360282898,
"learning_rate": 7.150432824437428e-06,
"loss": 0.3785,
"step": 1177
},
{
"epoch": 1.2689407540394972,
"grad_norm": 0.34475672245025635,
"learning_rate": 7.144770074781411e-06,
"loss": 0.4354,
"step": 1178
},
{
"epoch": 1.2700179533213645,
"grad_norm": 0.32504379749298096,
"learning_rate": 7.139103951747694e-06,
"loss": 0.4122,
"step": 1179
},
{
"epoch": 1.2710951526032317,
"grad_norm": 0.3236904442310333,
"learning_rate": 7.133434464248178e-06,
"loss": 0.3816,
"step": 1180
},
{
"epoch": 1.2721723518850987,
"grad_norm": 0.3493375778198242,
"learning_rate": 7.1277616212000524e-06,
"loss": 0.4048,
"step": 1181
},
{
"epoch": 1.273249551166966,
"grad_norm": 0.31458455324172974,
"learning_rate": 7.122085431525785e-06,
"loss": 0.3877,
"step": 1182
},
{
"epoch": 1.2743267504488331,
"grad_norm": 0.34635964035987854,
"learning_rate": 7.116405904153105e-06,
"loss": 0.4073,
"step": 1183
},
{
"epoch": 1.2754039497307001,
"grad_norm": 0.3543740510940552,
"learning_rate": 7.110723048014996e-06,
"loss": 0.4039,
"step": 1184
},
{
"epoch": 1.2764811490125674,
"grad_norm": 0.33583390712738037,
"learning_rate": 7.105036872049676e-06,
"loss": 0.393,
"step": 1185
},
{
"epoch": 1.2775583482944346,
"grad_norm": 0.38352057337760925,
"learning_rate": 7.09934738520058e-06,
"loss": 0.3927,
"step": 1186
},
{
"epoch": 1.2786355475763016,
"grad_norm": 0.3392017185688019,
"learning_rate": 7.093654596416357e-06,
"loss": 0.3783,
"step": 1187
},
{
"epoch": 1.2797127468581688,
"grad_norm": 0.4042876362800598,
"learning_rate": 7.0879585146508455e-06,
"loss": 0.4005,
"step": 1188
},
{
"epoch": 1.2807899461400358,
"grad_norm": 0.37154635787010193,
"learning_rate": 7.082259148863064e-06,
"loss": 0.4274,
"step": 1189
},
{
"epoch": 1.281867145421903,
"grad_norm": 0.34719568490982056,
"learning_rate": 7.076556508017196e-06,
"loss": 0.3759,
"step": 1190
},
{
"epoch": 1.2829443447037703,
"grad_norm": 0.3831841051578522,
"learning_rate": 7.0708506010825774e-06,
"loss": 0.4559,
"step": 1191
},
{
"epoch": 1.2840215439856373,
"grad_norm": 0.3383837938308716,
"learning_rate": 7.06514143703368e-06,
"loss": 0.3748,
"step": 1192
},
{
"epoch": 1.2850987432675045,
"grad_norm": 0.33272892236709595,
"learning_rate": 7.0594290248501e-06,
"loss": 0.387,
"step": 1193
},
{
"epoch": 1.2861759425493715,
"grad_norm": 0.35053086280822754,
"learning_rate": 7.053713373516538e-06,
"loss": 0.4227,
"step": 1194
},
{
"epoch": 1.2872531418312387,
"grad_norm": 0.3342570960521698,
"learning_rate": 7.0479944920227945e-06,
"loss": 0.4216,
"step": 1195
},
{
"epoch": 1.288330341113106,
"grad_norm": 0.33486494421958923,
"learning_rate": 7.042272389363749e-06,
"loss": 0.4162,
"step": 1196
},
{
"epoch": 1.289407540394973,
"grad_norm": 0.3178257644176483,
"learning_rate": 7.036547074539347e-06,
"loss": 0.3873,
"step": 1197
},
{
"epoch": 1.2904847396768402,
"grad_norm": 0.30604350566864014,
"learning_rate": 7.030818556554586e-06,
"loss": 0.3962,
"step": 1198
},
{
"epoch": 1.2915619389587074,
"grad_norm": 0.3792921304702759,
"learning_rate": 7.0250868444195e-06,
"loss": 0.3978,
"step": 1199
},
{
"epoch": 1.2926391382405744,
"grad_norm": 0.31305480003356934,
"learning_rate": 7.019351947149149e-06,
"loss": 0.4062,
"step": 1200
},
{
"epoch": 1.2937163375224416,
"grad_norm": 0.31872621178627014,
"learning_rate": 7.013613873763603e-06,
"loss": 0.3575,
"step": 1201
},
{
"epoch": 1.2947935368043089,
"grad_norm": 0.3531589210033417,
"learning_rate": 7.007872633287926e-06,
"loss": 0.4109,
"step": 1202
},
{
"epoch": 1.2958707360861759,
"grad_norm": 0.32369256019592285,
"learning_rate": 7.002128234752167e-06,
"loss": 0.402,
"step": 1203
},
{
"epoch": 1.296947935368043,
"grad_norm": 0.3656080961227417,
"learning_rate": 6.996380687191335e-06,
"loss": 0.3856,
"step": 1204
},
{
"epoch": 1.2980251346499103,
"grad_norm": 0.38284584879875183,
"learning_rate": 6.990629999645399e-06,
"loss": 0.4304,
"step": 1205
},
{
"epoch": 1.2991023339317773,
"grad_norm": 0.29890871047973633,
"learning_rate": 6.984876181159261e-06,
"loss": 0.4062,
"step": 1206
},
{
"epoch": 1.3001795332136445,
"grad_norm": 0.381234735250473,
"learning_rate": 6.979119240782753e-06,
"loss": 0.3795,
"step": 1207
},
{
"epoch": 1.3012567324955118,
"grad_norm": 0.356242835521698,
"learning_rate": 6.973359187570614e-06,
"loss": 0.3962,
"step": 1208
},
{
"epoch": 1.3023339317773788,
"grad_norm": 0.34831711649894714,
"learning_rate": 6.9675960305824785e-06,
"loss": 0.4346,
"step": 1209
},
{
"epoch": 1.303411131059246,
"grad_norm": 0.3573039770126343,
"learning_rate": 6.9618297788828635e-06,
"loss": 0.4217,
"step": 1210
},
{
"epoch": 1.3044883303411132,
"grad_norm": 0.4218464195728302,
"learning_rate": 6.956060441541155e-06,
"loss": 0.3697,
"step": 1211
},
{
"epoch": 1.3055655296229802,
"grad_norm": 0.3260680139064789,
"learning_rate": 6.9502880276315885e-06,
"loss": 0.4182,
"step": 1212
},
{
"epoch": 1.3066427289048475,
"grad_norm": 0.3798454999923706,
"learning_rate": 6.9445125462332455e-06,
"loss": 0.422,
"step": 1213
},
{
"epoch": 1.3077199281867147,
"grad_norm": 0.412260502576828,
"learning_rate": 6.9387340064300234e-06,
"loss": 0.4255,
"step": 1214
},
{
"epoch": 1.3087971274685817,
"grad_norm": 0.4004683792591095,
"learning_rate": 6.932952417310634e-06,
"loss": 0.4081,
"step": 1215
},
{
"epoch": 1.309874326750449,
"grad_norm": 0.34901517629623413,
"learning_rate": 6.927167787968589e-06,
"loss": 0.4045,
"step": 1216
},
{
"epoch": 1.310951526032316,
"grad_norm": 0.41624924540519714,
"learning_rate": 6.9213801275021744e-06,
"loss": 0.4067,
"step": 1217
},
{
"epoch": 1.3120287253141831,
"grad_norm": 0.36287832260131836,
"learning_rate": 6.915589445014448e-06,
"loss": 0.3913,
"step": 1218
},
{
"epoch": 1.3131059245960501,
"grad_norm": 0.375931054353714,
"learning_rate": 6.909795749613223e-06,
"loss": 0.4075,
"step": 1219
},
{
"epoch": 1.3141831238779174,
"grad_norm": 0.3683956563472748,
"learning_rate": 6.903999050411046e-06,
"loss": 0.3901,
"step": 1220
},
{
"epoch": 1.3152603231597846,
"grad_norm": 0.38740620017051697,
"learning_rate": 6.89819935652519e-06,
"loss": 0.3912,
"step": 1221
},
{
"epoch": 1.3163375224416516,
"grad_norm": 0.503844678401947,
"learning_rate": 6.892396677077641e-06,
"loss": 0.4379,
"step": 1222
},
{
"epoch": 1.3174147217235188,
"grad_norm": 0.3334852159023285,
"learning_rate": 6.886591021195077e-06,
"loss": 0.3847,
"step": 1223
},
{
"epoch": 1.318491921005386,
"grad_norm": 0.3811096251010895,
"learning_rate": 6.880782398008862e-06,
"loss": 0.4156,
"step": 1224
},
{
"epoch": 1.319569120287253,
"grad_norm": 0.4092799723148346,
"learning_rate": 6.874970816655021e-06,
"loss": 0.4026,
"step": 1225
},
{
"epoch": 1.3206463195691203,
"grad_norm": 0.4070848822593689,
"learning_rate": 6.8691562862742365e-06,
"loss": 0.4356,
"step": 1226
},
{
"epoch": 1.3217235188509875,
"grad_norm": 0.3308897018432617,
"learning_rate": 6.8633388160118265e-06,
"loss": 0.3495,
"step": 1227
},
{
"epoch": 1.3228007181328545,
"grad_norm": 0.345034122467041,
"learning_rate": 6.857518415017736e-06,
"loss": 0.4193,
"step": 1228
},
{
"epoch": 1.3238779174147217,
"grad_norm": 0.37432533502578735,
"learning_rate": 6.851695092446517e-06,
"loss": 0.4124,
"step": 1229
},
{
"epoch": 1.324955116696589,
"grad_norm": 0.3827821910381317,
"learning_rate": 6.8458688574573164e-06,
"loss": 0.4333,
"step": 1230
},
{
"epoch": 1.326032315978456,
"grad_norm": 0.31386035680770874,
"learning_rate": 6.840039719213864e-06,
"loss": 0.378,
"step": 1231
},
{
"epoch": 1.3271095152603232,
"grad_norm": 0.3906676769256592,
"learning_rate": 6.8342076868844556e-06,
"loss": 0.4205,
"step": 1232
},
{
"epoch": 1.3281867145421904,
"grad_norm": 0.3502320945262909,
"learning_rate": 6.828372769641938e-06,
"loss": 0.3764,
"step": 1233
},
{
"epoch": 1.3292639138240574,
"grad_norm": 0.39450767636299133,
"learning_rate": 6.822534976663695e-06,
"loss": 0.4607,
"step": 1234
},
{
"epoch": 1.3303411131059246,
"grad_norm": 0.34283024072647095,
"learning_rate": 6.816694317131634e-06,
"loss": 0.3987,
"step": 1235
},
{
"epoch": 1.3314183123877918,
"grad_norm": 0.3752940893173218,
"learning_rate": 6.8108508002321714e-06,
"loss": 0.3941,
"step": 1236
},
{
"epoch": 1.3324955116696588,
"grad_norm": 0.3372066915035248,
"learning_rate": 6.8050044351562185e-06,
"loss": 0.3973,
"step": 1237
},
{
"epoch": 1.333572710951526,
"grad_norm": 0.39815032482147217,
"learning_rate": 6.799155231099164e-06,
"loss": 0.4359,
"step": 1238
},
{
"epoch": 1.3346499102333933,
"grad_norm": 0.343666672706604,
"learning_rate": 6.7933031972608644e-06,
"loss": 0.3829,
"step": 1239
},
{
"epoch": 1.3357271095152603,
"grad_norm": 0.3752990663051605,
"learning_rate": 6.787448342845626e-06,
"loss": 0.3999,
"step": 1240
},
{
"epoch": 1.3368043087971275,
"grad_norm": 0.37065207958221436,
"learning_rate": 6.781590677062191e-06,
"loss": 0.405,
"step": 1241
},
{
"epoch": 1.3378815080789948,
"grad_norm": 0.3138620853424072,
"learning_rate": 6.775730209123722e-06,
"loss": 0.3885,
"step": 1242
},
{
"epoch": 1.3389587073608618,
"grad_norm": 0.3156231939792633,
"learning_rate": 6.769866948247793e-06,
"loss": 0.3818,
"step": 1243
},
{
"epoch": 1.340035906642729,
"grad_norm": 0.37582793831825256,
"learning_rate": 6.764000903656367e-06,
"loss": 0.4588,
"step": 1244
},
{
"epoch": 1.341113105924596,
"grad_norm": 0.3268308937549591,
"learning_rate": 6.758132084575791e-06,
"loss": 0.3828,
"step": 1245
},
{
"epoch": 1.3421903052064632,
"grad_norm": 0.30771052837371826,
"learning_rate": 6.752260500236764e-06,
"loss": 0.3833,
"step": 1246
},
{
"epoch": 1.3432675044883302,
"grad_norm": 0.3514450788497925,
"learning_rate": 6.746386159874348e-06,
"loss": 0.4094,
"step": 1247
},
{
"epoch": 1.3443447037701974,
"grad_norm": 0.38996556401252747,
"learning_rate": 6.740509072727931e-06,
"loss": 0.4503,
"step": 1248
},
{
"epoch": 1.3454219030520647,
"grad_norm": 0.32034119963645935,
"learning_rate": 6.734629248041226e-06,
"loss": 0.3973,
"step": 1249
},
{
"epoch": 1.3464991023339317,
"grad_norm": 0.34276941418647766,
"learning_rate": 6.728746695062249e-06,
"loss": 0.406,
"step": 1250
},
{
"epoch": 1.3475763016157989,
"grad_norm": 0.34551379084587097,
"learning_rate": 6.722861423043305e-06,
"loss": 0.383,
"step": 1251
},
{
"epoch": 1.3486535008976661,
"grad_norm": 0.40802478790283203,
"learning_rate": 6.716973441240982e-06,
"loss": 0.4247,
"step": 1252
},
{
"epoch": 1.3497307001795331,
"grad_norm": 0.343085378408432,
"learning_rate": 6.711082758916127e-06,
"loss": 0.4242,
"step": 1253
},
{
"epoch": 1.3508078994614003,
"grad_norm": 0.32430559396743774,
"learning_rate": 6.70518938533383e-06,
"loss": 0.3895,
"step": 1254
},
{
"epoch": 1.3518850987432676,
"grad_norm": 0.3160029351711273,
"learning_rate": 6.699293329763421e-06,
"loss": 0.36,
"step": 1255
},
{
"epoch": 1.3529622980251346,
"grad_norm": 0.34324607253074646,
"learning_rate": 6.693394601478447e-06,
"loss": 0.3892,
"step": 1256
},
{
"epoch": 1.3540394973070018,
"grad_norm": 0.3541221022605896,
"learning_rate": 6.687493209756652e-06,
"loss": 0.3938,
"step": 1257
},
{
"epoch": 1.355116696588869,
"grad_norm": 0.36511629819869995,
"learning_rate": 6.681589163879978e-06,
"loss": 0.442,
"step": 1258
},
{
"epoch": 1.356193895870736,
"grad_norm": 0.34376540780067444,
"learning_rate": 6.675682473134536e-06,
"loss": 0.3888,
"step": 1259
},
{
"epoch": 1.3572710951526032,
"grad_norm": 0.38606876134872437,
"learning_rate": 6.6697731468105985e-06,
"loss": 0.3869,
"step": 1260
},
{
"epoch": 1.3583482944344705,
"grad_norm": 0.3761198818683624,
"learning_rate": 6.663861194202588e-06,
"loss": 0.4477,
"step": 1261
},
{
"epoch": 1.3594254937163375,
"grad_norm": 0.32027143239974976,
"learning_rate": 6.657946624609046e-06,
"loss": 0.4213,
"step": 1262
},
{
"epoch": 1.3605026929982047,
"grad_norm": 0.36698633432388306,
"learning_rate": 6.6520294473326415e-06,
"loss": 0.3932,
"step": 1263
},
{
"epoch": 1.361579892280072,
"grad_norm": 0.3574357330799103,
"learning_rate": 6.64610967168014e-06,
"loss": 0.4246,
"step": 1264
},
{
"epoch": 1.362657091561939,
"grad_norm": 0.3355792760848999,
"learning_rate": 6.640187306962395e-06,
"loss": 0.3943,
"step": 1265
},
{
"epoch": 1.3637342908438062,
"grad_norm": 0.3679425120353699,
"learning_rate": 6.6342623624943325e-06,
"loss": 0.4204,
"step": 1266
},
{
"epoch": 1.3648114901256734,
"grad_norm": 0.35469403862953186,
"learning_rate": 6.6283348475949335e-06,
"loss": 0.4361,
"step": 1267
},
{
"epoch": 1.3658886894075404,
"grad_norm": 0.3616523742675781,
"learning_rate": 6.622404771587225e-06,
"loss": 0.3992,
"step": 1268
},
{
"epoch": 1.3669658886894076,
"grad_norm": 0.3479689657688141,
"learning_rate": 6.61647214379826e-06,
"loss": 0.4223,
"step": 1269
},
{
"epoch": 1.3680430879712746,
"grad_norm": 0.35383719205856323,
"learning_rate": 6.610536973559108e-06,
"loss": 0.4039,
"step": 1270
},
{
"epoch": 1.3691202872531418,
"grad_norm": 0.3851570188999176,
"learning_rate": 6.604599270204832e-06,
"loss": 0.3863,
"step": 1271
},
{
"epoch": 1.370197486535009,
"grad_norm": 0.3704424798488617,
"learning_rate": 6.598659043074487e-06,
"loss": 0.4074,
"step": 1272
},
{
"epoch": 1.371274685816876,
"grad_norm": 0.3516056537628174,
"learning_rate": 6.592716301511089e-06,
"loss": 0.4075,
"step": 1273
},
{
"epoch": 1.3723518850987433,
"grad_norm": 0.38077306747436523,
"learning_rate": 6.586771054861613e-06,
"loss": 0.3979,
"step": 1274
},
{
"epoch": 1.3734290843806103,
"grad_norm": 0.3695003092288971,
"learning_rate": 6.580823312476976e-06,
"loss": 0.4206,
"step": 1275
},
{
"epoch": 1.3745062836624775,
"grad_norm": 0.34240230917930603,
"learning_rate": 6.574873083712018e-06,
"loss": 0.4126,
"step": 1276
},
{
"epoch": 1.3755834829443447,
"grad_norm": 0.3861750066280365,
"learning_rate": 6.568920377925491e-06,
"loss": 0.4133,
"step": 1277
},
{
"epoch": 1.3766606822262117,
"grad_norm": 0.3501964211463928,
"learning_rate": 6.562965204480038e-06,
"loss": 0.4055,
"step": 1278
},
{
"epoch": 1.377737881508079,
"grad_norm": 0.42631974816322327,
"learning_rate": 6.55700757274219e-06,
"loss": 0.4126,
"step": 1279
},
{
"epoch": 1.3788150807899462,
"grad_norm": 0.37425652146339417,
"learning_rate": 6.5510474920823404e-06,
"loss": 0.4251,
"step": 1280
},
{
"epoch": 1.3798922800718132,
"grad_norm": 0.37494876980781555,
"learning_rate": 6.545084971874738e-06,
"loss": 0.3962,
"step": 1281
},
{
"epoch": 1.3809694793536804,
"grad_norm": 0.4501427114009857,
"learning_rate": 6.5391200214974645e-06,
"loss": 0.462,
"step": 1282
},
{
"epoch": 1.3820466786355476,
"grad_norm": 0.3058687150478363,
"learning_rate": 6.5331526503324235e-06,
"loss": 0.3501,
"step": 1283
},
{
"epoch": 1.3831238779174146,
"grad_norm": 0.3958328366279602,
"learning_rate": 6.527182867765333e-06,
"loss": 0.4446,
"step": 1284
},
{
"epoch": 1.3842010771992819,
"grad_norm": 0.32861238718032837,
"learning_rate": 6.5212106831856944e-06,
"loss": 0.4054,
"step": 1285
},
{
"epoch": 1.385278276481149,
"grad_norm": 0.43862593173980713,
"learning_rate": 6.515236105986795e-06,
"loss": 0.4333,
"step": 1286
},
{
"epoch": 1.386355475763016,
"grad_norm": 0.32525646686553955,
"learning_rate": 6.509259145565681e-06,
"loss": 0.3857,
"step": 1287
},
{
"epoch": 1.3874326750448833,
"grad_norm": 0.35433462262153625,
"learning_rate": 6.503279811323145e-06,
"loss": 0.4293,
"step": 1288
},
{
"epoch": 1.3885098743267505,
"grad_norm": 0.37304699420928955,
"learning_rate": 6.497298112663721e-06,
"loss": 0.3904,
"step": 1289
},
{
"epoch": 1.3895870736086176,
"grad_norm": 0.4006001055240631,
"learning_rate": 6.491314058995653e-06,
"loss": 0.4138,
"step": 1290
},
{
"epoch": 1.3906642728904848,
"grad_norm": 0.3498937487602234,
"learning_rate": 6.4853276597308955e-06,
"loss": 0.4247,
"step": 1291
},
{
"epoch": 1.391741472172352,
"grad_norm": 0.39775341749191284,
"learning_rate": 6.479338924285089e-06,
"loss": 0.398,
"step": 1292
},
{
"epoch": 1.392818671454219,
"grad_norm": 0.34264737367630005,
"learning_rate": 6.4733478620775515e-06,
"loss": 0.3777,
"step": 1293
},
{
"epoch": 1.3938958707360862,
"grad_norm": 0.3468000590801239,
"learning_rate": 6.467354482531254e-06,
"loss": 0.3946,
"step": 1294
},
{
"epoch": 1.3949730700179535,
"grad_norm": 0.3531480133533478,
"learning_rate": 6.46135879507282e-06,
"loss": 0.4241,
"step": 1295
},
{
"epoch": 1.3960502692998205,
"grad_norm": 0.32537776231765747,
"learning_rate": 6.455360809132497e-06,
"loss": 0.3745,
"step": 1296
},
{
"epoch": 1.3971274685816877,
"grad_norm": 0.34955230355262756,
"learning_rate": 6.449360534144154e-06,
"loss": 0.3936,
"step": 1297
},
{
"epoch": 1.3982046678635547,
"grad_norm": 0.3445831537246704,
"learning_rate": 6.443357979545254e-06,
"loss": 0.4084,
"step": 1298
},
{
"epoch": 1.399281867145422,
"grad_norm": 0.332506388425827,
"learning_rate": 6.437353154776848e-06,
"loss": 0.4327,
"step": 1299
},
{
"epoch": 1.400359066427289,
"grad_norm": 0.3073407709598541,
"learning_rate": 6.4313460692835586e-06,
"loss": 0.3687,
"step": 1300
},
{
"epoch": 1.4014362657091561,
"grad_norm": 0.36171579360961914,
"learning_rate": 6.425336732513564e-06,
"loss": 0.4231,
"step": 1301
},
{
"epoch": 1.4025134649910234,
"grad_norm": 0.29764875769615173,
"learning_rate": 6.419325153918581e-06,
"loss": 0.3665,
"step": 1302
},
{
"epoch": 1.4035906642728904,
"grad_norm": 0.372881680727005,
"learning_rate": 6.413311342953854e-06,
"loss": 0.4249,
"step": 1303
},
{
"epoch": 1.4046678635547576,
"grad_norm": 0.3532498776912689,
"learning_rate": 6.407295309078139e-06,
"loss": 0.4069,
"step": 1304
},
{
"epoch": 1.4057450628366248,
"grad_norm": 0.30739298462867737,
"learning_rate": 6.401277061753689e-06,
"loss": 0.3926,
"step": 1305
},
{
"epoch": 1.4068222621184918,
"grad_norm": 0.36615023016929626,
"learning_rate": 6.395256610446234e-06,
"loss": 0.4107,
"step": 1306
},
{
"epoch": 1.407899461400359,
"grad_norm": 0.31940293312072754,
"learning_rate": 6.389233964624977e-06,
"loss": 0.4299,
"step": 1307
},
{
"epoch": 1.4089766606822263,
"grad_norm": 0.3129923939704895,
"learning_rate": 6.383209133762569e-06,
"loss": 0.3991,
"step": 1308
},
{
"epoch": 1.4100538599640933,
"grad_norm": 0.35873693227767944,
"learning_rate": 6.377182127335096e-06,
"loss": 0.3943,
"step": 1309
},
{
"epoch": 1.4111310592459605,
"grad_norm": 0.3312750458717346,
"learning_rate": 6.3711529548220695e-06,
"loss": 0.3629,
"step": 1310
},
{
"epoch": 1.4122082585278277,
"grad_norm": 0.395020991563797,
"learning_rate": 6.365121625706405e-06,
"loss": 0.4153,
"step": 1311
},
{
"epoch": 1.4132854578096947,
"grad_norm": 0.303682804107666,
"learning_rate": 6.359088149474412e-06,
"loss": 0.3788,
"step": 1312
},
{
"epoch": 1.414362657091562,
"grad_norm": 0.3261646032333374,
"learning_rate": 6.3530525356157765e-06,
"loss": 0.4019,
"step": 1313
},
{
"epoch": 1.4154398563734292,
"grad_norm": 0.41472819447517395,
"learning_rate": 6.3470147936235485e-06,
"loss": 0.415,
"step": 1314
},
{
"epoch": 1.4165170556552962,
"grad_norm": 0.3044980764389038,
"learning_rate": 6.340974932994119e-06,
"loss": 0.3774,
"step": 1315
},
{
"epoch": 1.4175942549371634,
"grad_norm": 0.3422330319881439,
"learning_rate": 6.334932963227216e-06,
"loss": 0.4393,
"step": 1316
},
{
"epoch": 1.4186714542190306,
"grad_norm": 0.3625028431415558,
"learning_rate": 6.328888893825888e-06,
"loss": 0.4372,
"step": 1317
},
{
"epoch": 1.4197486535008976,
"grad_norm": 0.30885642766952515,
"learning_rate": 6.3228427342964785e-06,
"loss": 0.3729,
"step": 1318
},
{
"epoch": 1.4208258527827649,
"grad_norm": 0.33570152521133423,
"learning_rate": 6.316794494148625e-06,
"loss": 0.3929,
"step": 1319
},
{
"epoch": 1.421903052064632,
"grad_norm": 0.354067325592041,
"learning_rate": 6.310744182895231e-06,
"loss": 0.4338,
"step": 1320
},
{
"epoch": 1.422980251346499,
"grad_norm": 0.30694371461868286,
"learning_rate": 6.304691810052466e-06,
"loss": 0.3792,
"step": 1321
},
{
"epoch": 1.4240574506283663,
"grad_norm": 0.31922197341918945,
"learning_rate": 6.2986373851397305e-06,
"loss": 0.3964,
"step": 1322
},
{
"epoch": 1.4251346499102335,
"grad_norm": 0.3309083580970764,
"learning_rate": 6.292580917679665e-06,
"loss": 0.417,
"step": 1323
},
{
"epoch": 1.4262118491921005,
"grad_norm": 0.3300463557243347,
"learning_rate": 6.286522417198115e-06,
"loss": 0.4203,
"step": 1324
},
{
"epoch": 1.4272890484739678,
"grad_norm": 0.31575945019721985,
"learning_rate": 6.280461893224127e-06,
"loss": 0.3785,
"step": 1325
},
{
"epoch": 1.4283662477558348,
"grad_norm": 0.2900710701942444,
"learning_rate": 6.274399355289924e-06,
"loss": 0.3781,
"step": 1326
},
{
"epoch": 1.429443447037702,
"grad_norm": 0.30679771304130554,
"learning_rate": 6.2683348129309056e-06,
"loss": 0.3743,
"step": 1327
},
{
"epoch": 1.430520646319569,
"grad_norm": 0.33957985043525696,
"learning_rate": 6.262268275685617e-06,
"loss": 0.4142,
"step": 1328
},
{
"epoch": 1.4315978456014362,
"grad_norm": 0.31916379928588867,
"learning_rate": 6.256199753095745e-06,
"loss": 0.4061,
"step": 1329
},
{
"epoch": 1.4326750448833034,
"grad_norm": 0.33593031764030457,
"learning_rate": 6.250129254706099e-06,
"loss": 0.3854,
"step": 1330
},
{
"epoch": 1.4337522441651704,
"grad_norm": 0.32574328780174255,
"learning_rate": 6.244056790064591e-06,
"loss": 0.4029,
"step": 1331
},
{
"epoch": 1.4348294434470377,
"grad_norm": 0.33279579877853394,
"learning_rate": 6.237982368722232e-06,
"loss": 0.4166,
"step": 1332
},
{
"epoch": 1.435906642728905,
"grad_norm": 0.333748996257782,
"learning_rate": 6.231906000233108e-06,
"loss": 0.3701,
"step": 1333
},
{
"epoch": 1.436983842010772,
"grad_norm": 0.36953479051589966,
"learning_rate": 6.225827694154365e-06,
"loss": 0.4345,
"step": 1334
},
{
"epoch": 1.4380610412926391,
"grad_norm": 0.3151591718196869,
"learning_rate": 6.219747460046203e-06,
"loss": 0.3932,
"step": 1335
},
{
"epoch": 1.4391382405745063,
"grad_norm": 0.4083271920681,
"learning_rate": 6.213665307471846e-06,
"loss": 0.4524,
"step": 1336
},
{
"epoch": 1.4402154398563733,
"grad_norm": 0.3362996280193329,
"learning_rate": 6.207581245997544e-06,
"loss": 0.4069,
"step": 1337
},
{
"epoch": 1.4412926391382406,
"grad_norm": 0.3313988447189331,
"learning_rate": 6.201495285192543e-06,
"loss": 0.4126,
"step": 1338
},
{
"epoch": 1.4423698384201078,
"grad_norm": 0.3780890107154846,
"learning_rate": 6.1954074346290775e-06,
"loss": 0.3615,
"step": 1339
},
{
"epoch": 1.4434470377019748,
"grad_norm": 0.3669836223125458,
"learning_rate": 6.189317703882357e-06,
"loss": 0.4363,
"step": 1340
},
{
"epoch": 1.444524236983842,
"grad_norm": 0.3658630847930908,
"learning_rate": 6.183226102530547e-06,
"loss": 0.3859,
"step": 1341
},
{
"epoch": 1.4456014362657092,
"grad_norm": 0.34788578748703003,
"learning_rate": 6.177132640154754e-06,
"loss": 0.3998,
"step": 1342
},
{
"epoch": 1.4466786355475763,
"grad_norm": 0.3125394582748413,
"learning_rate": 6.171037326339011e-06,
"loss": 0.3862,
"step": 1343
},
{
"epoch": 1.4477558348294435,
"grad_norm": 0.3761058747768402,
"learning_rate": 6.164940170670266e-06,
"loss": 0.3958,
"step": 1344
},
{
"epoch": 1.4488330341113107,
"grad_norm": 0.42082053422927856,
"learning_rate": 6.1588411827383644e-06,
"loss": 0.458,
"step": 1345
},
{
"epoch": 1.4499102333931777,
"grad_norm": 0.34688693284988403,
"learning_rate": 6.152740372136028e-06,
"loss": 0.3899,
"step": 1346
},
{
"epoch": 1.450987432675045,
"grad_norm": 0.313841849565506,
"learning_rate": 6.1466377484588495e-06,
"loss": 0.3904,
"step": 1347
},
{
"epoch": 1.4520646319569122,
"grad_norm": 0.4089730679988861,
"learning_rate": 6.140533321305273e-06,
"loss": 0.4011,
"step": 1348
},
{
"epoch": 1.4531418312387792,
"grad_norm": 0.4075726568698883,
"learning_rate": 6.134427100276579e-06,
"loss": 0.4093,
"step": 1349
},
{
"epoch": 1.4542190305206464,
"grad_norm": 0.3309878706932068,
"learning_rate": 6.128319094976869e-06,
"loss": 0.4003,
"step": 1350
},
{
"epoch": 1.4552962298025134,
"grad_norm": 0.3432037830352783,
"learning_rate": 6.122209315013049e-06,
"loss": 0.4111,
"step": 1351
},
{
"epoch": 1.4563734290843806,
"grad_norm": 0.3966643810272217,
"learning_rate": 6.116097769994821e-06,
"loss": 0.3895,
"step": 1352
},
{
"epoch": 1.4574506283662478,
"grad_norm": 0.3343490660190582,
"learning_rate": 6.10998446953466e-06,
"loss": 0.4089,
"step": 1353
},
{
"epoch": 1.4585278276481148,
"grad_norm": 0.32433220744132996,
"learning_rate": 6.1038694232478e-06,
"loss": 0.3838,
"step": 1354
},
{
"epoch": 1.459605026929982,
"grad_norm": 0.35544490814208984,
"learning_rate": 6.097752640752227e-06,
"loss": 0.4038,
"step": 1355
},
{
"epoch": 1.460682226211849,
"grad_norm": 0.335077702999115,
"learning_rate": 6.0916341316686526e-06,
"loss": 0.3854,
"step": 1356
},
{
"epoch": 1.4617594254937163,
"grad_norm": 0.3600093126296997,
"learning_rate": 6.085513905620504e-06,
"loss": 0.4118,
"step": 1357
},
{
"epoch": 1.4628366247755835,
"grad_norm": 0.36083582043647766,
"learning_rate": 6.079391972233913e-06,
"loss": 0.3839,
"step": 1358
},
{
"epoch": 1.4639138240574505,
"grad_norm": 0.3288957476615906,
"learning_rate": 6.073268341137694e-06,
"loss": 0.3872,
"step": 1359
},
{
"epoch": 1.4649910233393177,
"grad_norm": 0.3327372372150421,
"learning_rate": 6.067143021963329e-06,
"loss": 0.3964,
"step": 1360
},
{
"epoch": 1.466068222621185,
"grad_norm": 0.34401968121528625,
"learning_rate": 6.061016024344962e-06,
"loss": 0.3825,
"step": 1361
},
{
"epoch": 1.467145421903052,
"grad_norm": 0.3438219428062439,
"learning_rate": 6.054887357919371e-06,
"loss": 0.3866,
"step": 1362
},
{
"epoch": 1.4682226211849192,
"grad_norm": 0.290068119764328,
"learning_rate": 6.048757032325959e-06,
"loss": 0.3764,
"step": 1363
},
{
"epoch": 1.4692998204667864,
"grad_norm": 0.3384719789028168,
"learning_rate": 6.042625057206742e-06,
"loss": 0.4162,
"step": 1364
},
{
"epoch": 1.4703770197486534,
"grad_norm": 0.341897577047348,
"learning_rate": 6.03649144220633e-06,
"loss": 0.4196,
"step": 1365
},
{
"epoch": 1.4714542190305206,
"grad_norm": 0.37803781032562256,
"learning_rate": 6.030356196971911e-06,
"loss": 0.4159,
"step": 1366
},
{
"epoch": 1.4725314183123879,
"grad_norm": 0.32955625653266907,
"learning_rate": 6.024219331153232e-06,
"loss": 0.381,
"step": 1367
},
{
"epoch": 1.4736086175942549,
"grad_norm": 0.34146472811698914,
"learning_rate": 6.018080854402599e-06,
"loss": 0.4089,
"step": 1368
},
{
"epoch": 1.474685816876122,
"grad_norm": 0.3598938286304474,
"learning_rate": 6.0119407763748465e-06,
"loss": 0.3758,
"step": 1369
},
{
"epoch": 1.4757630161579893,
"grad_norm": 0.3479284346103668,
"learning_rate": 6.005799106727324e-06,
"loss": 0.4202,
"step": 1370
},
{
"epoch": 1.4768402154398563,
"grad_norm": 0.31264403462409973,
"learning_rate": 5.999655855119893e-06,
"loss": 0.3895,
"step": 1371
},
{
"epoch": 1.4779174147217236,
"grad_norm": 0.33001115918159485,
"learning_rate": 5.993511031214895e-06,
"loss": 0.4032,
"step": 1372
},
{
"epoch": 1.4789946140035908,
"grad_norm": 0.29274216294288635,
"learning_rate": 5.987364644677148e-06,
"loss": 0.4061,
"step": 1373
},
{
"epoch": 1.4800718132854578,
"grad_norm": 0.30409663915634155,
"learning_rate": 5.98121670517393e-06,
"loss": 0.4165,
"step": 1374
},
{
"epoch": 1.481149012567325,
"grad_norm": 0.30471399426460266,
"learning_rate": 5.9750672223749574e-06,
"loss": 0.3834,
"step": 1375
},
{
"epoch": 1.4822262118491922,
"grad_norm": 0.3169211149215698,
"learning_rate": 5.968916205952374e-06,
"loss": 0.4026,
"step": 1376
},
{
"epoch": 1.4833034111310592,
"grad_norm": 0.3480145335197449,
"learning_rate": 5.962763665580741e-06,
"loss": 0.4518,
"step": 1377
},
{
"epoch": 1.4843806104129265,
"grad_norm": 0.28615716099739075,
"learning_rate": 5.95660961093701e-06,
"loss": 0.3795,
"step": 1378
},
{
"epoch": 1.4854578096947935,
"grad_norm": 0.3205227255821228,
"learning_rate": 5.950454051700519e-06,
"loss": 0.3821,
"step": 1379
},
{
"epoch": 1.4865350089766607,
"grad_norm": 0.3589239716529846,
"learning_rate": 5.944296997552968e-06,
"loss": 0.4231,
"step": 1380
},
{
"epoch": 1.4876122082585277,
"grad_norm": 0.33773183822631836,
"learning_rate": 5.938138458178414e-06,
"loss": 0.4315,
"step": 1381
},
{
"epoch": 1.488689407540395,
"grad_norm": 0.32655930519104004,
"learning_rate": 5.931978443263247e-06,
"loss": 0.3883,
"step": 1382
},
{
"epoch": 1.4897666068222621,
"grad_norm": 0.3508589565753937,
"learning_rate": 5.9258169624961745e-06,
"loss": 0.4055,
"step": 1383
},
{
"epoch": 1.4908438061041291,
"grad_norm": 0.3352316915988922,
"learning_rate": 5.919654025568216e-06,
"loss": 0.3968,
"step": 1384
},
{
"epoch": 1.4919210053859964,
"grad_norm": 0.30605775117874146,
"learning_rate": 5.9134896421726775e-06,
"loss": 0.3667,
"step": 1385
},
{
"epoch": 1.4929982046678636,
"grad_norm": 0.33124783635139465,
"learning_rate": 5.90732382200514e-06,
"loss": 0.4374,
"step": 1386
},
{
"epoch": 1.4940754039497306,
"grad_norm": 0.32206031680107117,
"learning_rate": 5.901156574763445e-06,
"loss": 0.4078,
"step": 1387
},
{
"epoch": 1.4951526032315978,
"grad_norm": 0.3807339072227478,
"learning_rate": 5.894987910147679e-06,
"loss": 0.4221,
"step": 1388
},
{
"epoch": 1.496229802513465,
"grad_norm": 0.2875092327594757,
"learning_rate": 5.8888178378601565e-06,
"loss": 0.3774,
"step": 1389
},
{
"epoch": 1.497307001795332,
"grad_norm": 0.3753995895385742,
"learning_rate": 5.882646367605409e-06,
"loss": 0.3996,
"step": 1390
},
{
"epoch": 1.4983842010771993,
"grad_norm": 0.35869738459587097,
"learning_rate": 5.876473509090164e-06,
"loss": 0.3722,
"step": 1391
},
{
"epoch": 1.4994614003590665,
"grad_norm": 0.34174516797065735,
"learning_rate": 5.87029927202333e-06,
"loss": 0.4506,
"step": 1392
},
{
"epoch": 1.5005385996409335,
"grad_norm": 0.3361514210700989,
"learning_rate": 5.864123666115991e-06,
"loss": 0.3368,
"step": 1393
},
{
"epoch": 1.5016157989228007,
"grad_norm": 0.37364038825035095,
"learning_rate": 5.85794670108138e-06,
"loss": 0.4076,
"step": 1394
},
{
"epoch": 1.502692998204668,
"grad_norm": 0.3185519278049469,
"learning_rate": 5.851768386634863e-06,
"loss": 0.4179,
"step": 1395
},
{
"epoch": 1.503770197486535,
"grad_norm": 0.3299560844898224,
"learning_rate": 5.845588732493936e-06,
"loss": 0.3884,
"step": 1396
},
{
"epoch": 1.5048473967684022,
"grad_norm": 0.3319254517555237,
"learning_rate": 5.839407748378202e-06,
"loss": 0.3817,
"step": 1397
},
{
"epoch": 1.5059245960502694,
"grad_norm": 0.30551543831825256,
"learning_rate": 5.8332254440093486e-06,
"loss": 0.4472,
"step": 1398
},
{
"epoch": 1.5070017953321364,
"grad_norm": 0.34325891733169556,
"learning_rate": 5.827041829111144e-06,
"loss": 0.4122,
"step": 1399
},
{
"epoch": 1.5080789946140036,
"grad_norm": 0.3262201249599457,
"learning_rate": 5.8208569134094205e-06,
"loss": 0.3872,
"step": 1400
},
{
"epoch": 1.5091561938958709,
"grad_norm": 0.3270936608314514,
"learning_rate": 5.814670706632054e-06,
"loss": 0.4357,
"step": 1401
},
{
"epoch": 1.5102333931777379,
"grad_norm": 0.3235735595226288,
"learning_rate": 5.808483218508949e-06,
"loss": 0.4109,
"step": 1402
},
{
"epoch": 1.5113105924596049,
"grad_norm": 0.354521781206131,
"learning_rate": 5.8022944587720285e-06,
"loss": 0.4008,
"step": 1403
},
{
"epoch": 1.5123877917414723,
"grad_norm": 0.31210362911224365,
"learning_rate": 5.796104437155213e-06,
"loss": 0.371,
"step": 1404
},
{
"epoch": 1.5134649910233393,
"grad_norm": 0.31420016288757324,
"learning_rate": 5.78991316339441e-06,
"loss": 0.4104,
"step": 1405
},
{
"epoch": 1.5145421903052063,
"grad_norm": 0.3347276449203491,
"learning_rate": 5.7837206472274955e-06,
"loss": 0.4055,
"step": 1406
},
{
"epoch": 1.5156193895870738,
"grad_norm": 0.3168209195137024,
"learning_rate": 5.777526898394298e-06,
"loss": 0.4153,
"step": 1407
},
{
"epoch": 1.5166965888689408,
"grad_norm": 0.32604894042015076,
"learning_rate": 5.7713319266365886e-06,
"loss": 0.4178,
"step": 1408
},
{
"epoch": 1.5177737881508078,
"grad_norm": 0.3354335427284241,
"learning_rate": 5.765135741698058e-06,
"loss": 0.3787,
"step": 1409
},
{
"epoch": 1.518850987432675,
"grad_norm": 0.3192979395389557,
"learning_rate": 5.758938353324308e-06,
"loss": 0.4114,
"step": 1410
},
{
"epoch": 1.5199281867145422,
"grad_norm": 0.30226173996925354,
"learning_rate": 5.75273977126283e-06,
"loss": 0.3713,
"step": 1411
},
{
"epoch": 1.5210053859964092,
"grad_norm": 0.33431458473205566,
"learning_rate": 5.746540005262994e-06,
"loss": 0.4055,
"step": 1412
},
{
"epoch": 1.5220825852782764,
"grad_norm": 0.33149486780166626,
"learning_rate": 5.740339065076036e-06,
"loss": 0.3925,
"step": 1413
},
{
"epoch": 1.5231597845601437,
"grad_norm": 0.3282601237297058,
"learning_rate": 5.734136960455035e-06,
"loss": 0.4061,
"step": 1414
},
{
"epoch": 1.5242369838420107,
"grad_norm": 0.3495166599750519,
"learning_rate": 5.727933701154899e-06,
"loss": 0.3943,
"step": 1415
},
{
"epoch": 1.525314183123878,
"grad_norm": 0.325663685798645,
"learning_rate": 5.721729296932358e-06,
"loss": 0.4149,
"step": 1416
},
{
"epoch": 1.5263913824057451,
"grad_norm": 0.34505152702331543,
"learning_rate": 5.71552375754594e-06,
"loss": 0.4459,
"step": 1417
},
{
"epoch": 1.5274685816876121,
"grad_norm": 0.3084740936756134,
"learning_rate": 5.709317092755956e-06,
"loss": 0.3692,
"step": 1418
},
{
"epoch": 1.5285457809694794,
"grad_norm": 0.3517147898674011,
"learning_rate": 5.703109312324493e-06,
"loss": 0.4125,
"step": 1419
},
{
"epoch": 1.5296229802513466,
"grad_norm": 0.3403216600418091,
"learning_rate": 5.696900426015386e-06,
"loss": 0.3974,
"step": 1420
},
{
"epoch": 1.5307001795332136,
"grad_norm": 0.27224382758140564,
"learning_rate": 5.6906904435942145e-06,
"loss": 0.3477,
"step": 1421
},
{
"epoch": 1.5317773788150808,
"grad_norm": 0.36993709206581116,
"learning_rate": 5.68447937482828e-06,
"loss": 0.4152,
"step": 1422
},
{
"epoch": 1.532854578096948,
"grad_norm": 0.3202926814556122,
"learning_rate": 5.678267229486592e-06,
"loss": 0.4153,
"step": 1423
},
{
"epoch": 1.533931777378815,
"grad_norm": 0.2956278324127197,
"learning_rate": 5.672054017339855e-06,
"loss": 0.401,
"step": 1424
},
{
"epoch": 1.5350089766606823,
"grad_norm": 0.29878416657447815,
"learning_rate": 5.66583974816045e-06,
"loss": 0.3923,
"step": 1425
},
{
"epoch": 1.5360861759425495,
"grad_norm": 0.3247506320476532,
"learning_rate": 5.659624431722421e-06,
"loss": 0.4041,
"step": 1426
},
{
"epoch": 1.5371633752244165,
"grad_norm": 0.310533344745636,
"learning_rate": 5.653408077801459e-06,
"loss": 0.4062,
"step": 1427
},
{
"epoch": 1.5382405745062837,
"grad_norm": 0.3016843795776367,
"learning_rate": 5.647190696174886e-06,
"loss": 0.3877,
"step": 1428
},
{
"epoch": 1.539317773788151,
"grad_norm": 0.34463533759117126,
"learning_rate": 5.640972296621644e-06,
"loss": 0.4253,
"step": 1429
},
{
"epoch": 1.540394973070018,
"grad_norm": 0.32542088627815247,
"learning_rate": 5.6347528889222715e-06,
"loss": 0.3706,
"step": 1430
},
{
"epoch": 1.541472172351885,
"grad_norm": 0.32016703486442566,
"learning_rate": 5.628532482858894e-06,
"loss": 0.4277,
"step": 1431
},
{
"epoch": 1.5425493716337524,
"grad_norm": 0.367384135723114,
"learning_rate": 5.622311088215209e-06,
"loss": 0.4243,
"step": 1432
},
{
"epoch": 1.5436265709156194,
"grad_norm": 0.351962685585022,
"learning_rate": 5.616088714776469e-06,
"loss": 0.4034,
"step": 1433
},
{
"epoch": 1.5447037701974864,
"grad_norm": 0.34793248772621155,
"learning_rate": 5.609865372329461e-06,
"loss": 0.4189,
"step": 1434
},
{
"epoch": 1.5457809694793538,
"grad_norm": 0.34114158153533936,
"learning_rate": 5.603641070662502e-06,
"loss": 0.3852,
"step": 1435
},
{
"epoch": 1.5468581687612208,
"grad_norm": 0.3659258186817169,
"learning_rate": 5.597415819565416e-06,
"loss": 0.4205,
"step": 1436
},
{
"epoch": 1.5479353680430878,
"grad_norm": 0.32382774353027344,
"learning_rate": 5.591189628829519e-06,
"loss": 0.3746,
"step": 1437
},
{
"epoch": 1.549012567324955,
"grad_norm": 0.35888931155204773,
"learning_rate": 5.584962508247605e-06,
"loss": 0.4222,
"step": 1438
},
{
"epoch": 1.5500897666068223,
"grad_norm": 0.34174293279647827,
"learning_rate": 5.578734467613933e-06,
"loss": 0.3859,
"step": 1439
},
{
"epoch": 1.5511669658886893,
"grad_norm": 0.32036033272743225,
"learning_rate": 5.572505516724207e-06,
"loss": 0.3988,
"step": 1440
},
{
"epoch": 1.5522441651705565,
"grad_norm": 0.35757505893707275,
"learning_rate": 5.56627566537556e-06,
"loss": 0.4279,
"step": 1441
},
{
"epoch": 1.5533213644524237,
"grad_norm": 0.26979249715805054,
"learning_rate": 5.560044923366549e-06,
"loss": 0.3469,
"step": 1442
},
{
"epoch": 1.5543985637342908,
"grad_norm": 0.33907294273376465,
"learning_rate": 5.5538133004971216e-06,
"loss": 0.4008,
"step": 1443
},
{
"epoch": 1.555475763016158,
"grad_norm": 0.3330419659614563,
"learning_rate": 5.547580806568621e-06,
"loss": 0.4267,
"step": 1444
},
{
"epoch": 1.5565529622980252,
"grad_norm": 0.36036917567253113,
"learning_rate": 5.541347451383755e-06,
"loss": 0.4155,
"step": 1445
},
{
"epoch": 1.5576301615798922,
"grad_norm": 0.3305310904979706,
"learning_rate": 5.535113244746585e-06,
"loss": 0.4039,
"step": 1446
},
{
"epoch": 1.5587073608617594,
"grad_norm": 0.34608691930770874,
"learning_rate": 5.528878196462514e-06,
"loss": 0.4116,
"step": 1447
},
{
"epoch": 1.5597845601436267,
"grad_norm": 0.36932167410850525,
"learning_rate": 5.522642316338268e-06,
"loss": 0.4179,
"step": 1448
},
{
"epoch": 1.5608617594254937,
"grad_norm": 0.33676591515541077,
"learning_rate": 5.516405614181883e-06,
"loss": 0.4099,
"step": 1449
},
{
"epoch": 1.5619389587073609,
"grad_norm": 0.3538258969783783,
"learning_rate": 5.5101680998026855e-06,
"loss": 0.4211,
"step": 1450
},
{
"epoch": 1.563016157989228,
"grad_norm": 0.3520103693008423,
"learning_rate": 5.503929783011279e-06,
"loss": 0.4095,
"step": 1451
},
{
"epoch": 1.564093357271095,
"grad_norm": 0.39654335379600525,
"learning_rate": 5.497690673619532e-06,
"loss": 0.383,
"step": 1452
},
{
"epoch": 1.5651705565529623,
"grad_norm": 0.3446529805660248,
"learning_rate": 5.4914507814405596e-06,
"loss": 0.4224,
"step": 1453
},
{
"epoch": 1.5662477558348296,
"grad_norm": 0.3643217384815216,
"learning_rate": 5.485210116288704e-06,
"loss": 0.415,
"step": 1454
},
{
"epoch": 1.5673249551166966,
"grad_norm": 0.33305594325065613,
"learning_rate": 5.478968687979527e-06,
"loss": 0.3999,
"step": 1455
},
{
"epoch": 1.5684021543985638,
"grad_norm": 0.31847333908081055,
"learning_rate": 5.472726506329789e-06,
"loss": 0.3692,
"step": 1456
},
{
"epoch": 1.569479353680431,
"grad_norm": 0.3522320091724396,
"learning_rate": 5.466483581157437e-06,
"loss": 0.4035,
"step": 1457
},
{
"epoch": 1.570556552962298,
"grad_norm": 0.35457131266593933,
"learning_rate": 5.460239922281586e-06,
"loss": 0.4142,
"step": 1458
},
{
"epoch": 1.571633752244165,
"grad_norm": 0.3094504773616791,
"learning_rate": 5.453995539522503e-06,
"loss": 0.4002,
"step": 1459
},
{
"epoch": 1.5727109515260325,
"grad_norm": 0.32812392711639404,
"learning_rate": 5.447750442701598e-06,
"loss": 0.3666,
"step": 1460
},
{
"epoch": 1.5737881508078995,
"grad_norm": 0.3192622661590576,
"learning_rate": 5.441504641641403e-06,
"loss": 0.3929,
"step": 1461
},
{
"epoch": 1.5748653500897665,
"grad_norm": 0.31926393508911133,
"learning_rate": 5.435258146165554e-06,
"loss": 0.401,
"step": 1462
},
{
"epoch": 1.575942549371634,
"grad_norm": 0.3556506931781769,
"learning_rate": 5.429010966098782e-06,
"loss": 0.3921,
"step": 1463
},
{
"epoch": 1.577019748653501,
"grad_norm": 0.3289359211921692,
"learning_rate": 5.4227631112668955e-06,
"loss": 0.3963,
"step": 1464
},
{
"epoch": 1.578096947935368,
"grad_norm": 0.35782623291015625,
"learning_rate": 5.416514591496764e-06,
"loss": 0.4222,
"step": 1465
},
{
"epoch": 1.5791741472172351,
"grad_norm": 0.3453352153301239,
"learning_rate": 5.410265416616301e-06,
"loss": 0.3899,
"step": 1466
},
{
"epoch": 1.5802513464991024,
"grad_norm": 0.34165066480636597,
"learning_rate": 5.404015596454451e-06,
"loss": 0.3652,
"step": 1467
},
{
"epoch": 1.5813285457809694,
"grad_norm": 0.336681067943573,
"learning_rate": 5.397765140841174e-06,
"loss": 0.4057,
"step": 1468
},
{
"epoch": 1.5824057450628366,
"grad_norm": 0.3774108588695526,
"learning_rate": 5.391514059607431e-06,
"loss": 0.4302,
"step": 1469
},
{
"epoch": 1.5834829443447038,
"grad_norm": 0.32935065031051636,
"learning_rate": 5.3852623625851655e-06,
"loss": 0.3606,
"step": 1470
},
{
"epoch": 1.5845601436265708,
"grad_norm": 0.3437838852405548,
"learning_rate": 5.379010059607288e-06,
"loss": 0.3976,
"step": 1471
},
{
"epoch": 1.585637342908438,
"grad_norm": 0.34462642669677734,
"learning_rate": 5.372757160507663e-06,
"loss": 0.4161,
"step": 1472
},
{
"epoch": 1.5867145421903053,
"grad_norm": 0.4068163335323334,
"learning_rate": 5.366503675121095e-06,
"loss": 0.4021,
"step": 1473
},
{
"epoch": 1.5877917414721723,
"grad_norm": 0.3602428436279297,
"learning_rate": 5.360249613283308e-06,
"loss": 0.4126,
"step": 1474
},
{
"epoch": 1.5888689407540395,
"grad_norm": 0.3262472152709961,
"learning_rate": 5.353994984830934e-06,
"loss": 0.429,
"step": 1475
},
{
"epoch": 1.5899461400359067,
"grad_norm": 0.3102273643016815,
"learning_rate": 5.347739799601494e-06,
"loss": 0.3751,
"step": 1476
},
{
"epoch": 1.5910233393177737,
"grad_norm": 0.3460737466812134,
"learning_rate": 5.341484067433388e-06,
"loss": 0.3969,
"step": 1477
},
{
"epoch": 1.592100538599641,
"grad_norm": 0.345058798789978,
"learning_rate": 5.335227798165874e-06,
"loss": 0.4097,
"step": 1478
},
{
"epoch": 1.5931777378815082,
"grad_norm": 0.3214409351348877,
"learning_rate": 5.328971001639054e-06,
"loss": 0.4073,
"step": 1479
},
{
"epoch": 1.5942549371633752,
"grad_norm": 0.343375563621521,
"learning_rate": 5.322713687693862e-06,
"loss": 0.4327,
"step": 1480
},
{
"epoch": 1.5953321364452424,
"grad_norm": 0.34606754779815674,
"learning_rate": 5.3164558661720456e-06,
"loss": 0.424,
"step": 1481
},
{
"epoch": 1.5964093357271096,
"grad_norm": 0.38280874490737915,
"learning_rate": 5.310197546916149e-06,
"loss": 0.4247,
"step": 1482
},
{
"epoch": 1.5974865350089766,
"grad_norm": 0.339191734790802,
"learning_rate": 5.303938739769498e-06,
"loss": 0.3879,
"step": 1483
},
{
"epoch": 1.5985637342908436,
"grad_norm": 0.3148673474788666,
"learning_rate": 5.2976794545761886e-06,
"loss": 0.3802,
"step": 1484
},
{
"epoch": 1.599640933572711,
"grad_norm": 0.3606517016887665,
"learning_rate": 5.291419701181069e-06,
"loss": 0.4378,
"step": 1485
},
{
"epoch": 1.600718132854578,
"grad_norm": 0.35120633244514465,
"learning_rate": 5.285159489429722e-06,
"loss": 0.4015,
"step": 1486
},
{
"epoch": 1.601795332136445,
"grad_norm": 0.3325619101524353,
"learning_rate": 5.2788988291684506e-06,
"loss": 0.3774,
"step": 1487
},
{
"epoch": 1.6028725314183125,
"grad_norm": 0.3451593518257141,
"learning_rate": 5.272637730244265e-06,
"loss": 0.4173,
"step": 1488
},
{
"epoch": 1.6039497307001795,
"grad_norm": 0.36361247301101685,
"learning_rate": 5.266376202504866e-06,
"loss": 0.3992,
"step": 1489
},
{
"epoch": 1.6050269299820465,
"grad_norm": 0.32423025369644165,
"learning_rate": 5.260114255798627e-06,
"loss": 0.3862,
"step": 1490
},
{
"epoch": 1.6061041292639138,
"grad_norm": 0.32100334763526917,
"learning_rate": 5.25385189997458e-06,
"loss": 0.4275,
"step": 1491
},
{
"epoch": 1.607181328545781,
"grad_norm": 0.35108858346939087,
"learning_rate": 5.2475891448824025e-06,
"loss": 0.3976,
"step": 1492
},
{
"epoch": 1.608258527827648,
"grad_norm": 0.3189638555049896,
"learning_rate": 5.2413260003724e-06,
"loss": 0.397,
"step": 1493
},
{
"epoch": 1.6093357271095152,
"grad_norm": 0.3238120675086975,
"learning_rate": 5.235062476295488e-06,
"loss": 0.4195,
"step": 1494
},
{
"epoch": 1.6104129263913824,
"grad_norm": 0.3653884530067444,
"learning_rate": 5.228798582503181e-06,
"loss": 0.387,
"step": 1495
},
{
"epoch": 1.6114901256732495,
"grad_norm": 0.3390481173992157,
"learning_rate": 5.222534328847574e-06,
"loss": 0.3873,
"step": 1496
},
{
"epoch": 1.6125673249551167,
"grad_norm": 0.3277999758720398,
"learning_rate": 5.21626972518133e-06,
"loss": 0.3865,
"step": 1497
},
{
"epoch": 1.613644524236984,
"grad_norm": 0.3376871943473816,
"learning_rate": 5.21000478135766e-06,
"loss": 0.4334,
"step": 1498
},
{
"epoch": 1.614721723518851,
"grad_norm": 0.4035407602787018,
"learning_rate": 5.203739507230311e-06,
"loss": 0.4109,
"step": 1499
},
{
"epoch": 1.6157989228007181,
"grad_norm": 0.33631792664527893,
"learning_rate": 5.197473912653549e-06,
"loss": 0.4145,
"step": 1500
},
{
"epoch": 1.6168761220825854,
"grad_norm": 0.33172816038131714,
"learning_rate": 5.191208007482145e-06,
"loss": 0.3723,
"step": 1501
},
{
"epoch": 1.6179533213644524,
"grad_norm": 0.3129732310771942,
"learning_rate": 5.184941801571359e-06,
"loss": 0.3939,
"step": 1502
},
{
"epoch": 1.6190305206463196,
"grad_norm": 0.32043594121932983,
"learning_rate": 5.1786753047769235e-06,
"loss": 0.3864,
"step": 1503
},
{
"epoch": 1.6201077199281868,
"grad_norm": 0.38129621744155884,
"learning_rate": 5.172408526955025e-06,
"loss": 0.3873,
"step": 1504
},
{
"epoch": 1.6211849192100538,
"grad_norm": 0.3243739902973175,
"learning_rate": 5.166141477962298e-06,
"loss": 0.4044,
"step": 1505
},
{
"epoch": 1.622262118491921,
"grad_norm": 0.30711236596107483,
"learning_rate": 5.1598741676557995e-06,
"loss": 0.3538,
"step": 1506
},
{
"epoch": 1.6233393177737883,
"grad_norm": 0.3542765974998474,
"learning_rate": 5.153606605892999e-06,
"loss": 0.43,
"step": 1507
},
{
"epoch": 1.6244165170556553,
"grad_norm": 0.39987432956695557,
"learning_rate": 5.147338802531762e-06,
"loss": 0.4364,
"step": 1508
},
{
"epoch": 1.6254937163375225,
"grad_norm": 0.34654441475868225,
"learning_rate": 5.141070767430331e-06,
"loss": 0.4161,
"step": 1509
},
{
"epoch": 1.6265709156193897,
"grad_norm": 0.3391593098640442,
"learning_rate": 5.134802510447318e-06,
"loss": 0.4211,
"step": 1510
},
{
"epoch": 1.6276481149012567,
"grad_norm": 0.36817681789398193,
"learning_rate": 5.128534041441677e-06,
"loss": 0.3796,
"step": 1511
},
{
"epoch": 1.6287253141831237,
"grad_norm": 0.3822740614414215,
"learning_rate": 5.1222653702727035e-06,
"loss": 0.3863,
"step": 1512
},
{
"epoch": 1.6298025134649912,
"grad_norm": 0.3610847294330597,
"learning_rate": 5.115996506800004e-06,
"loss": 0.3947,
"step": 1513
},
{
"epoch": 1.6308797127468582,
"grad_norm": 0.3673597574234009,
"learning_rate": 5.109727460883496e-06,
"loss": 0.4001,
"step": 1514
},
{
"epoch": 1.6319569120287252,
"grad_norm": 0.36558011174201965,
"learning_rate": 5.103458242383371e-06,
"loss": 0.3957,
"step": 1515
},
{
"epoch": 1.6330341113105926,
"grad_norm": 0.38542303442955017,
"learning_rate": 5.097188861160103e-06,
"loss": 0.4457,
"step": 1516
},
{
"epoch": 1.6341113105924596,
"grad_norm": 0.3248066008090973,
"learning_rate": 5.09091932707442e-06,
"loss": 0.3801,
"step": 1517
},
{
"epoch": 1.6351885098743266,
"grad_norm": 0.3572705388069153,
"learning_rate": 5.084649649987285e-06,
"loss": 0.3779,
"step": 1518
},
{
"epoch": 1.6362657091561938,
"grad_norm": 0.3426295220851898,
"learning_rate": 5.078379839759895e-06,
"loss": 0.4289,
"step": 1519
},
{
"epoch": 1.637342908438061,
"grad_norm": 0.313820481300354,
"learning_rate": 5.072109906253646e-06,
"loss": 0.4179,
"step": 1520
},
{
"epoch": 1.638420107719928,
"grad_norm": 0.31881383061408997,
"learning_rate": 5.065839859330134e-06,
"loss": 0.3918,
"step": 1521
},
{
"epoch": 1.6394973070017953,
"grad_norm": 0.34397974610328674,
"learning_rate": 5.059569708851136e-06,
"loss": 0.3973,
"step": 1522
},
{
"epoch": 1.6405745062836625,
"grad_norm": 0.32578712701797485,
"learning_rate": 5.053299464678583e-06,
"loss": 0.4084,
"step": 1523
},
{
"epoch": 1.6416517055655295,
"grad_norm": 0.3264220058917999,
"learning_rate": 5.047029136674563e-06,
"loss": 0.4077,
"step": 1524
},
{
"epoch": 1.6427289048473968,
"grad_norm": 0.37533849477767944,
"learning_rate": 5.040758734701289e-06,
"loss": 0.4076,
"step": 1525
},
{
"epoch": 1.643806104129264,
"grad_norm": 0.2892257273197174,
"learning_rate": 5.034488268621094e-06,
"loss": 0.3809,
"step": 1526
},
{
"epoch": 1.644883303411131,
"grad_norm": 0.3342662751674652,
"learning_rate": 5.028217748296409e-06,
"loss": 0.4308,
"step": 1527
},
{
"epoch": 1.6459605026929982,
"grad_norm": 0.30961301922798157,
"learning_rate": 5.021947183589753e-06,
"loss": 0.401,
"step": 1528
},
{
"epoch": 1.6470377019748654,
"grad_norm": 0.3177073299884796,
"learning_rate": 5.015676584363716e-06,
"loss": 0.3879,
"step": 1529
},
{
"epoch": 1.6481149012567324,
"grad_norm": 0.32608547806739807,
"learning_rate": 5.009405960480937e-06,
"loss": 0.4081,
"step": 1530
},
{
"epoch": 1.6491921005385997,
"grad_norm": 0.3337622582912445,
"learning_rate": 5.003135321804098e-06,
"loss": 0.4059,
"step": 1531
},
{
"epoch": 1.6502692998204669,
"grad_norm": 0.33355656266212463,
"learning_rate": 4.996864678195903e-06,
"loss": 0.4241,
"step": 1532
},
{
"epoch": 1.6513464991023339,
"grad_norm": 0.3020173907279968,
"learning_rate": 4.990594039519064e-06,
"loss": 0.4117,
"step": 1533
},
{
"epoch": 1.6524236983842011,
"grad_norm": 0.2995854318141937,
"learning_rate": 4.984323415636285e-06,
"loss": 0.3731,
"step": 1534
},
{
"epoch": 1.6535008976660683,
"grad_norm": 0.3445095121860504,
"learning_rate": 4.9780528164102475e-06,
"loss": 0.4398,
"step": 1535
},
{
"epoch": 1.6545780969479353,
"grad_norm": 0.3377833366394043,
"learning_rate": 4.971782251703591e-06,
"loss": 0.417,
"step": 1536
},
{
"epoch": 1.6556552962298026,
"grad_norm": 0.30527958273887634,
"learning_rate": 4.965511731378909e-06,
"loss": 0.3804,
"step": 1537
},
{
"epoch": 1.6567324955116698,
"grad_norm": 0.31118062138557434,
"learning_rate": 4.959241265298713e-06,
"loss": 0.3935,
"step": 1538
},
{
"epoch": 1.6578096947935368,
"grad_norm": 0.34053924679756165,
"learning_rate": 4.95297086332544e-06,
"loss": 0.3933,
"step": 1539
},
{
"epoch": 1.6588868940754038,
"grad_norm": 0.3675541877746582,
"learning_rate": 4.946700535321419e-06,
"loss": 0.4748,
"step": 1540
},
{
"epoch": 1.6599640933572712,
"grad_norm": 0.29740291833877563,
"learning_rate": 4.940430291148866e-06,
"loss": 0.3722,
"step": 1541
},
{
"epoch": 1.6610412926391382,
"grad_norm": 0.33205488324165344,
"learning_rate": 4.9341601406698675e-06,
"loss": 0.3982,
"step": 1542
},
{
"epoch": 1.6621184919210052,
"grad_norm": 0.34859946370124817,
"learning_rate": 4.927890093746356e-06,
"loss": 0.3955,
"step": 1543
},
{
"epoch": 1.6631956912028727,
"grad_norm": 0.3418346047401428,
"learning_rate": 4.921620160240107e-06,
"loss": 0.4003,
"step": 1544
},
{
"epoch": 1.6642728904847397,
"grad_norm": 0.3368319869041443,
"learning_rate": 4.915350350012714e-06,
"loss": 0.3984,
"step": 1545
},
{
"epoch": 1.6653500897666067,
"grad_norm": 0.32968056201934814,
"learning_rate": 4.909080672925581e-06,
"loss": 0.3682,
"step": 1546
},
{
"epoch": 1.666427289048474,
"grad_norm": 0.3352184593677521,
"learning_rate": 4.902811138839897e-06,
"loss": 0.416,
"step": 1547
},
{
"epoch": 1.6675044883303412,
"grad_norm": 0.33194318413734436,
"learning_rate": 4.896541757616632e-06,
"loss": 0.4123,
"step": 1548
},
{
"epoch": 1.6685816876122082,
"grad_norm": 0.33617234230041504,
"learning_rate": 4.890272539116508e-06,
"loss": 0.4266,
"step": 1549
},
{
"epoch": 1.6696588868940754,
"grad_norm": 0.3273670971393585,
"learning_rate": 4.884003493199997e-06,
"loss": 0.3765,
"step": 1550
},
{
"epoch": 1.6707360861759426,
"grad_norm": 0.32140910625457764,
"learning_rate": 4.877734629727299e-06,
"loss": 0.3841,
"step": 1551
},
{
"epoch": 1.6718132854578096,
"grad_norm": 0.3900831937789917,
"learning_rate": 4.8714659585583246e-06,
"loss": 0.4438,
"step": 1552
},
{
"epoch": 1.6728904847396768,
"grad_norm": 0.3028438091278076,
"learning_rate": 4.865197489552684e-06,
"loss": 0.383,
"step": 1553
},
{
"epoch": 1.673967684021544,
"grad_norm": 0.3399808704853058,
"learning_rate": 4.858929232569671e-06,
"loss": 0.4089,
"step": 1554
},
{
"epoch": 1.675044883303411,
"grad_norm": 0.3515014350414276,
"learning_rate": 4.85266119746824e-06,
"loss": 0.3845,
"step": 1555
},
{
"epoch": 1.6761220825852783,
"grad_norm": 0.3478013873100281,
"learning_rate": 4.846393394107001e-06,
"loss": 0.405,
"step": 1556
},
{
"epoch": 1.6771992818671455,
"grad_norm": 0.3799297511577606,
"learning_rate": 4.840125832344202e-06,
"loss": 0.4192,
"step": 1557
},
{
"epoch": 1.6782764811490125,
"grad_norm": 0.340212881565094,
"learning_rate": 4.8338585220377045e-06,
"loss": 0.3992,
"step": 1558
},
{
"epoch": 1.6793536804308797,
"grad_norm": 0.34044545888900757,
"learning_rate": 4.827591473044978e-06,
"loss": 0.4019,
"step": 1559
},
{
"epoch": 1.680430879712747,
"grad_norm": 0.3173620104789734,
"learning_rate": 4.82132469522308e-06,
"loss": 0.3811,
"step": 1560
},
{
"epoch": 1.681508078994614,
"grad_norm": 0.30734777450561523,
"learning_rate": 4.815058198428643e-06,
"loss": 0.378,
"step": 1561
},
{
"epoch": 1.6825852782764812,
"grad_norm": 0.29187920689582825,
"learning_rate": 4.808791992517857e-06,
"loss": 0.3649,
"step": 1562
},
{
"epoch": 1.6836624775583484,
"grad_norm": 0.32691147923469543,
"learning_rate": 4.802526087346453e-06,
"loss": 0.4017,
"step": 1563
},
{
"epoch": 1.6847396768402154,
"grad_norm": 0.3336152136325836,
"learning_rate": 4.796260492769691e-06,
"loss": 0.437,
"step": 1564
},
{
"epoch": 1.6858168761220824,
"grad_norm": 0.32136350870132446,
"learning_rate": 4.789995218642341e-06,
"loss": 0.4128,
"step": 1565
},
{
"epoch": 1.6868940754039499,
"grad_norm": 0.32937243580818176,
"learning_rate": 4.783730274818671e-06,
"loss": 0.4215,
"step": 1566
},
{
"epoch": 1.6879712746858169,
"grad_norm": 0.3161608576774597,
"learning_rate": 4.777465671152426e-06,
"loss": 0.3936,
"step": 1567
},
{
"epoch": 1.6890484739676839,
"grad_norm": 0.3174351155757904,
"learning_rate": 4.771201417496819e-06,
"loss": 0.4123,
"step": 1568
},
{
"epoch": 1.6901256732495513,
"grad_norm": 0.3189868927001953,
"learning_rate": 4.7649375237045135e-06,
"loss": 0.3904,
"step": 1569
},
{
"epoch": 1.6912028725314183,
"grad_norm": 0.28452205657958984,
"learning_rate": 4.7586739996276015e-06,
"loss": 0.3672,
"step": 1570
},
{
"epoch": 1.6922800718132853,
"grad_norm": 0.3064326047897339,
"learning_rate": 4.752410855117599e-06,
"loss": 0.3998,
"step": 1571
},
{
"epoch": 1.6933572710951525,
"grad_norm": 0.3178927004337311,
"learning_rate": 4.746148100025422e-06,
"loss": 0.4376,
"step": 1572
},
{
"epoch": 1.6944344703770198,
"grad_norm": 0.3099832236766815,
"learning_rate": 4.7398857442013746e-06,
"loss": 0.368,
"step": 1573
},
{
"epoch": 1.6955116696588868,
"grad_norm": 0.3031095266342163,
"learning_rate": 4.733623797495136e-06,
"loss": 0.3652,
"step": 1574
},
{
"epoch": 1.696588868940754,
"grad_norm": 0.32943809032440186,
"learning_rate": 4.727362269755736e-06,
"loss": 0.4372,
"step": 1575
},
{
"epoch": 1.6976660682226212,
"grad_norm": 0.32587704062461853,
"learning_rate": 4.72110117083155e-06,
"loss": 0.4416,
"step": 1576
},
{
"epoch": 1.6987432675044882,
"grad_norm": 0.31048640608787537,
"learning_rate": 4.714840510570278e-06,
"loss": 0.4197,
"step": 1577
},
{
"epoch": 1.6998204667863555,
"grad_norm": 0.27757468819618225,
"learning_rate": 4.708580298818931e-06,
"loss": 0.3919,
"step": 1578
},
{
"epoch": 1.7008976660682227,
"grad_norm": 0.3723500370979309,
"learning_rate": 4.702320545423814e-06,
"loss": 0.3957,
"step": 1579
},
{
"epoch": 1.7019748653500897,
"grad_norm": 0.3020727336406708,
"learning_rate": 4.696061260230504e-06,
"loss": 0.3796,
"step": 1580
},
{
"epoch": 1.703052064631957,
"grad_norm": 0.3209006190299988,
"learning_rate": 4.689802453083854e-06,
"loss": 0.4248,
"step": 1581
},
{
"epoch": 1.7041292639138241,
"grad_norm": 0.307170033454895,
"learning_rate": 4.683544133827955e-06,
"loss": 0.3971,
"step": 1582
},
{
"epoch": 1.7052064631956911,
"grad_norm": 0.3253774344921112,
"learning_rate": 4.677286312306139e-06,
"loss": 0.3774,
"step": 1583
},
{
"epoch": 1.7062836624775584,
"grad_norm": 0.345463365316391,
"learning_rate": 4.671028998360947e-06,
"loss": 0.3968,
"step": 1584
},
{
"epoch": 1.7073608617594256,
"grad_norm": 0.3188075125217438,
"learning_rate": 4.664772201834128e-06,
"loss": 0.4299,
"step": 1585
},
{
"epoch": 1.7084380610412926,
"grad_norm": 0.3115480840206146,
"learning_rate": 4.658515932566614e-06,
"loss": 0.3815,
"step": 1586
},
{
"epoch": 1.7095152603231598,
"grad_norm": 0.3210900127887726,
"learning_rate": 4.652260200398507e-06,
"loss": 0.4019,
"step": 1587
},
{
"epoch": 1.710592459605027,
"grad_norm": 0.33627891540527344,
"learning_rate": 4.646005015169067e-06,
"loss": 0.4139,
"step": 1588
},
{
"epoch": 1.711669658886894,
"grad_norm": 0.3372555375099182,
"learning_rate": 4.639750386716693e-06,
"loss": 0.415,
"step": 1589
},
{
"epoch": 1.7127468581687613,
"grad_norm": 0.28700023889541626,
"learning_rate": 4.633496324878906e-06,
"loss": 0.3562,
"step": 1590
},
{
"epoch": 1.7138240574506285,
"grad_norm": 0.3731818199157715,
"learning_rate": 4.627242839492339e-06,
"loss": 0.4113,
"step": 1591
},
{
"epoch": 1.7149012567324955,
"grad_norm": 0.29507315158843994,
"learning_rate": 4.620989940392715e-06,
"loss": 0.3995,
"step": 1592
},
{
"epoch": 1.7159784560143625,
"grad_norm": 0.32059916853904724,
"learning_rate": 4.614737637414836e-06,
"loss": 0.3934,
"step": 1593
},
{
"epoch": 1.71705565529623,
"grad_norm": 0.29595065116882324,
"learning_rate": 4.60848594039257e-06,
"loss": 0.3558,
"step": 1594
},
{
"epoch": 1.718132854578097,
"grad_norm": 0.3473470211029053,
"learning_rate": 4.602234859158827e-06,
"loss": 0.4578,
"step": 1595
},
{
"epoch": 1.719210053859964,
"grad_norm": 0.3321559727191925,
"learning_rate": 4.59598440354555e-06,
"loss": 0.3863,
"step": 1596
},
{
"epoch": 1.7202872531418314,
"grad_norm": 0.3324938714504242,
"learning_rate": 4.5897345833837e-06,
"loss": 0.4062,
"step": 1597
},
{
"epoch": 1.7213644524236984,
"grad_norm": 0.29092180728912354,
"learning_rate": 4.583485408503237e-06,
"loss": 0.3987,
"step": 1598
},
{
"epoch": 1.7224416517055654,
"grad_norm": 0.2973518967628479,
"learning_rate": 4.5772368887331044e-06,
"loss": 0.383,
"step": 1599
},
{
"epoch": 1.7235188509874326,
"grad_norm": 0.3359359800815582,
"learning_rate": 4.5709890339012205e-06,
"loss": 0.3992,
"step": 1600
},
{
"epoch": 1.7245960502692999,
"grad_norm": 0.396452397108078,
"learning_rate": 4.564741853834448e-06,
"loss": 0.4233,
"step": 1601
},
{
"epoch": 1.7256732495511669,
"grad_norm": 0.3256533145904541,
"learning_rate": 4.558495358358599e-06,
"loss": 0.4415,
"step": 1602
},
{
"epoch": 1.726750448833034,
"grad_norm": 0.37291306257247925,
"learning_rate": 4.552249557298403e-06,
"loss": 0.3855,
"step": 1603
},
{
"epoch": 1.7278276481149013,
"grad_norm": 0.3769271969795227,
"learning_rate": 4.5460044604774986e-06,
"loss": 0.4087,
"step": 1604
},
{
"epoch": 1.7289048473967683,
"grad_norm": 0.35868149995803833,
"learning_rate": 4.539760077718416e-06,
"loss": 0.3917,
"step": 1605
},
{
"epoch": 1.7299820466786355,
"grad_norm": 0.3206818699836731,
"learning_rate": 4.533516418842565e-06,
"loss": 0.4174,
"step": 1606
},
{
"epoch": 1.7310592459605028,
"grad_norm": 0.3528873920440674,
"learning_rate": 4.5272734936702116e-06,
"loss": 0.405,
"step": 1607
},
{
"epoch": 1.7321364452423698,
"grad_norm": 0.4061715006828308,
"learning_rate": 4.521031312020473e-06,
"loss": 0.4191,
"step": 1608
},
{
"epoch": 1.733213644524237,
"grad_norm": 0.3145693838596344,
"learning_rate": 4.514789883711296e-06,
"loss": 0.4227,
"step": 1609
},
{
"epoch": 1.7342908438061042,
"grad_norm": 0.28356069326400757,
"learning_rate": 4.508549218559441e-06,
"loss": 0.372,
"step": 1610
},
{
"epoch": 1.7353680430879712,
"grad_norm": 0.3727303743362427,
"learning_rate": 4.5023093263804705e-06,
"loss": 0.4493,
"step": 1611
},
{
"epoch": 1.7364452423698384,
"grad_norm": 0.33184897899627686,
"learning_rate": 4.496070216988723e-06,
"loss": 0.3889,
"step": 1612
},
{
"epoch": 1.7375224416517057,
"grad_norm": 0.32492300868034363,
"learning_rate": 4.489831900197317e-06,
"loss": 0.3952,
"step": 1613
},
{
"epoch": 1.7385996409335727,
"grad_norm": 0.3340592384338379,
"learning_rate": 4.483594385818119e-06,
"loss": 0.3764,
"step": 1614
},
{
"epoch": 1.73967684021544,
"grad_norm": 0.2960026264190674,
"learning_rate": 4.477357683661734e-06,
"loss": 0.3841,
"step": 1615
},
{
"epoch": 1.7407540394973071,
"grad_norm": 0.33512938022613525,
"learning_rate": 4.471121803537488e-06,
"loss": 0.3839,
"step": 1616
},
{
"epoch": 1.7418312387791741,
"grad_norm": 0.3327915370464325,
"learning_rate": 4.464886755253417e-06,
"loss": 0.4129,
"step": 1617
},
{
"epoch": 1.7429084380610413,
"grad_norm": 0.3029802143573761,
"learning_rate": 4.4586525486162465e-06,
"loss": 0.3737,
"step": 1618
},
{
"epoch": 1.7439856373429086,
"grad_norm": 0.2849656641483307,
"learning_rate": 4.452419193431379e-06,
"loss": 0.4006,
"step": 1619
},
{
"epoch": 1.7450628366247756,
"grad_norm": 0.3106905519962311,
"learning_rate": 4.4461866995028776e-06,
"loss": 0.3991,
"step": 1620
},
{
"epoch": 1.7461400359066426,
"grad_norm": 0.33045652508735657,
"learning_rate": 4.439955076633454e-06,
"loss": 0.3969,
"step": 1621
},
{
"epoch": 1.74721723518851,
"grad_norm": 0.4101937711238861,
"learning_rate": 4.433724334624441e-06,
"loss": 0.4023,
"step": 1622
},
{
"epoch": 1.748294434470377,
"grad_norm": 0.31196269392967224,
"learning_rate": 4.4274944832757964e-06,
"loss": 0.3797,
"step": 1623
},
{
"epoch": 1.749371633752244,
"grad_norm": 0.38965967297554016,
"learning_rate": 4.4212655323860685e-06,
"loss": 0.4432,
"step": 1624
},
{
"epoch": 1.7504488330341115,
"grad_norm": 0.32944896817207336,
"learning_rate": 4.4150374917523955e-06,
"loss": 0.3666,
"step": 1625
},
{
"epoch": 1.7515260323159785,
"grad_norm": 0.31117597222328186,
"learning_rate": 4.408810371170484e-06,
"loss": 0.3885,
"step": 1626
},
{
"epoch": 1.7526032315978455,
"grad_norm": 0.3230034112930298,
"learning_rate": 4.402584180434586e-06,
"loss": 0.3943,
"step": 1627
},
{
"epoch": 1.7536804308797127,
"grad_norm": 0.3342301845550537,
"learning_rate": 4.396358929337499e-06,
"loss": 0.3917,
"step": 1628
},
{
"epoch": 1.75475763016158,
"grad_norm": 0.34093886613845825,
"learning_rate": 4.39013462767054e-06,
"loss": 0.3816,
"step": 1629
},
{
"epoch": 1.755834829443447,
"grad_norm": 0.28704625368118286,
"learning_rate": 4.3839112852235335e-06,
"loss": 0.4086,
"step": 1630
},
{
"epoch": 1.7569120287253142,
"grad_norm": 0.3321066200733185,
"learning_rate": 4.377688911784792e-06,
"loss": 0.4045,
"step": 1631
},
{
"epoch": 1.7579892280071814,
"grad_norm": 0.314985454082489,
"learning_rate": 4.371467517141108e-06,
"loss": 0.4045,
"step": 1632
},
{
"epoch": 1.7590664272890484,
"grad_norm": 0.28622984886169434,
"learning_rate": 4.365247111077731e-06,
"loss": 0.3668,
"step": 1633
},
{
"epoch": 1.7601436265709156,
"grad_norm": 0.33727702498435974,
"learning_rate": 4.359027703378357e-06,
"loss": 0.4285,
"step": 1634
},
{
"epoch": 1.7612208258527828,
"grad_norm": 0.29879483580589294,
"learning_rate": 4.352809303825115e-06,
"loss": 0.3871,
"step": 1635
},
{
"epoch": 1.7622980251346498,
"grad_norm": 0.3572829067707062,
"learning_rate": 4.346591922198542e-06,
"loss": 0.4131,
"step": 1636
},
{
"epoch": 1.763375224416517,
"grad_norm": 0.3054465651512146,
"learning_rate": 4.34037556827758e-06,
"loss": 0.4006,
"step": 1637
},
{
"epoch": 1.7644524236983843,
"grad_norm": 0.3047214448451996,
"learning_rate": 4.334160251839552e-06,
"loss": 0.408,
"step": 1638
},
{
"epoch": 1.7655296229802513,
"grad_norm": 0.3058657646179199,
"learning_rate": 4.327945982660146e-06,
"loss": 0.3935,
"step": 1639
},
{
"epoch": 1.7666068222621185,
"grad_norm": 0.3060075342655182,
"learning_rate": 4.321732770513408e-06,
"loss": 0.4331,
"step": 1640
},
{
"epoch": 1.7676840215439857,
"grad_norm": 0.31800925731658936,
"learning_rate": 4.31552062517172e-06,
"loss": 0.4002,
"step": 1641
},
{
"epoch": 1.7687612208258527,
"grad_norm": 0.30724993348121643,
"learning_rate": 4.309309556405786e-06,
"loss": 0.4331,
"step": 1642
},
{
"epoch": 1.76983842010772,
"grad_norm": 0.2819603681564331,
"learning_rate": 4.303099573984617e-06,
"loss": 0.3929,
"step": 1643
},
{
"epoch": 1.7709156193895872,
"grad_norm": 0.3609313368797302,
"learning_rate": 4.29689068767551e-06,
"loss": 0.4467,
"step": 1644
},
{
"epoch": 1.7719928186714542,
"grad_norm": 0.2898065745830536,
"learning_rate": 4.290682907244046e-06,
"loss": 0.3949,
"step": 1645
},
{
"epoch": 1.7730700179533212,
"grad_norm": 0.29993054270744324,
"learning_rate": 4.284476242454062e-06,
"loss": 0.405,
"step": 1646
},
{
"epoch": 1.7741472172351886,
"grad_norm": 0.36077243089675903,
"learning_rate": 4.278270703067644e-06,
"loss": 0.422,
"step": 1647
},
{
"epoch": 1.7752244165170556,
"grad_norm": 0.3274693787097931,
"learning_rate": 4.272066298845102e-06,
"loss": 0.3884,
"step": 1648
},
{
"epoch": 1.7763016157989227,
"grad_norm": 0.29320403933525085,
"learning_rate": 4.265863039544967e-06,
"loss": 0.3889,
"step": 1649
},
{
"epoch": 1.77737881508079,
"grad_norm": 0.31313973665237427,
"learning_rate": 4.259660934923965e-06,
"loss": 0.4039,
"step": 1650
},
{
"epoch": 1.778456014362657,
"grad_norm": 0.30798840522766113,
"learning_rate": 4.253459994737006e-06,
"loss": 0.392,
"step": 1651
},
{
"epoch": 1.779533213644524,
"grad_norm": 0.32937532663345337,
"learning_rate": 4.247260228737171e-06,
"loss": 0.4135,
"step": 1652
},
{
"epoch": 1.7806104129263913,
"grad_norm": 0.31155073642730713,
"learning_rate": 4.241061646675695e-06,
"loss": 0.3711,
"step": 1653
},
{
"epoch": 1.7816876122082586,
"grad_norm": 0.3357556164264679,
"learning_rate": 4.234864258301943e-06,
"loss": 0.4044,
"step": 1654
},
{
"epoch": 1.7827648114901256,
"grad_norm": 0.3224906921386719,
"learning_rate": 4.228668073363413e-06,
"loss": 0.4244,
"step": 1655
},
{
"epoch": 1.7838420107719928,
"grad_norm": 0.3008199632167816,
"learning_rate": 4.222473101605703e-06,
"loss": 0.3974,
"step": 1656
},
{
"epoch": 1.78491921005386,
"grad_norm": 0.32574763894081116,
"learning_rate": 4.216279352772506e-06,
"loss": 0.4055,
"step": 1657
},
{
"epoch": 1.785996409335727,
"grad_norm": 0.2971203327178955,
"learning_rate": 4.210086836605592e-06,
"loss": 0.396,
"step": 1658
},
{
"epoch": 1.7870736086175942,
"grad_norm": 0.31350067257881165,
"learning_rate": 4.203895562844789e-06,
"loss": 0.3938,
"step": 1659
},
{
"epoch": 1.7881508078994615,
"grad_norm": 0.33718141913414,
"learning_rate": 4.197705541227973e-06,
"loss": 0.409,
"step": 1660
},
{
"epoch": 1.7892280071813285,
"grad_norm": 0.3132805824279785,
"learning_rate": 4.1915167814910515e-06,
"loss": 0.4023,
"step": 1661
},
{
"epoch": 1.7903052064631957,
"grad_norm": 0.3028797209262848,
"learning_rate": 4.1853292933679476e-06,
"loss": 0.4175,
"step": 1662
},
{
"epoch": 1.791382405745063,
"grad_norm": 0.31412598490715027,
"learning_rate": 4.1791430865905795e-06,
"loss": 0.3823,
"step": 1663
},
{
"epoch": 1.79245960502693,
"grad_norm": 0.298460453748703,
"learning_rate": 4.172958170888858e-06,
"loss": 0.3611,
"step": 1664
},
{
"epoch": 1.7935368043087971,
"grad_norm": 0.3055499792098999,
"learning_rate": 4.166774555990654e-06,
"loss": 0.4062,
"step": 1665
},
{
"epoch": 1.7946140035906644,
"grad_norm": 0.2997424900531769,
"learning_rate": 4.1605922516218e-06,
"loss": 0.4274,
"step": 1666
},
{
"epoch": 1.7956912028725314,
"grad_norm": 0.30274149775505066,
"learning_rate": 4.154411267506065e-06,
"loss": 0.397,
"step": 1667
},
{
"epoch": 1.7967684021543986,
"grad_norm": 0.36079904437065125,
"learning_rate": 4.148231613365138e-06,
"loss": 0.3914,
"step": 1668
},
{
"epoch": 1.7978456014362658,
"grad_norm": 0.32576999068260193,
"learning_rate": 4.142053298918622e-06,
"loss": 0.3782,
"step": 1669
},
{
"epoch": 1.7989228007181328,
"grad_norm": 0.3388504087924957,
"learning_rate": 4.135876333884009e-06,
"loss": 0.4293,
"step": 1670
},
{
"epoch": 1.8,
"grad_norm": 0.3610943853855133,
"learning_rate": 4.129700727976671e-06,
"loss": 0.4118,
"step": 1671
},
{
"epoch": 1.8010771992818673,
"grad_norm": 0.3224228024482727,
"learning_rate": 4.123526490909837e-06,
"loss": 0.4185,
"step": 1672
},
{
"epoch": 1.8021543985637343,
"grad_norm": 0.2802104353904724,
"learning_rate": 4.11735363239459e-06,
"loss": 0.36,
"step": 1673
},
{
"epoch": 1.8032315978456013,
"grad_norm": 0.32841676473617554,
"learning_rate": 4.111182162139844e-06,
"loss": 0.3933,
"step": 1674
},
{
"epoch": 1.8043087971274687,
"grad_norm": 0.33936789631843567,
"learning_rate": 4.105012089852324e-06,
"loss": 0.3789,
"step": 1675
},
{
"epoch": 1.8053859964093357,
"grad_norm": 0.3322623670101166,
"learning_rate": 4.098843425236558e-06,
"loss": 0.4065,
"step": 1676
},
{
"epoch": 1.8064631956912027,
"grad_norm": 0.32670193910598755,
"learning_rate": 4.092676177994862e-06,
"loss": 0.4154,
"step": 1677
},
{
"epoch": 1.8075403949730702,
"grad_norm": 0.30702006816864014,
"learning_rate": 4.086510357827324e-06,
"loss": 0.4061,
"step": 1678
},
{
"epoch": 1.8086175942549372,
"grad_norm": 0.33039966225624084,
"learning_rate": 4.080345974431786e-06,
"loss": 0.415,
"step": 1679
},
{
"epoch": 1.8096947935368042,
"grad_norm": 0.30007797479629517,
"learning_rate": 4.074183037503827e-06,
"loss": 0.3897,
"step": 1680
},
{
"epoch": 1.8107719928186714,
"grad_norm": 0.3224261403083801,
"learning_rate": 4.068021556736755e-06,
"loss": 0.418,
"step": 1681
},
{
"epoch": 1.8118491921005386,
"grad_norm": 0.32132020592689514,
"learning_rate": 4.061861541821587e-06,
"loss": 0.4053,
"step": 1682
},
{
"epoch": 1.8129263913824056,
"grad_norm": 0.2934954762458801,
"learning_rate": 4.055703002447033e-06,
"loss": 0.3839,
"step": 1683
},
{
"epoch": 1.8140035906642729,
"grad_norm": 0.32513371109962463,
"learning_rate": 4.049545948299482e-06,
"loss": 0.4202,
"step": 1684
},
{
"epoch": 1.81508078994614,
"grad_norm": 0.3289554715156555,
"learning_rate": 4.043390389062993e-06,
"loss": 0.4068,
"step": 1685
},
{
"epoch": 1.816157989228007,
"grad_norm": 0.31976351141929626,
"learning_rate": 4.037236334419261e-06,
"loss": 0.437,
"step": 1686
},
{
"epoch": 1.8172351885098743,
"grad_norm": 0.2936280369758606,
"learning_rate": 4.0310837940476275e-06,
"loss": 0.3808,
"step": 1687
},
{
"epoch": 1.8183123877917415,
"grad_norm": 0.2943546175956726,
"learning_rate": 4.024932777625044e-06,
"loss": 0.403,
"step": 1688
},
{
"epoch": 1.8193895870736085,
"grad_norm": 0.3043786585330963,
"learning_rate": 4.018783294826071e-06,
"loss": 0.4002,
"step": 1689
},
{
"epoch": 1.8204667863554758,
"grad_norm": 0.32939180731773376,
"learning_rate": 4.0126353553228525e-06,
"loss": 0.3868,
"step": 1690
},
{
"epoch": 1.821543985637343,
"grad_norm": 0.30302363634109497,
"learning_rate": 4.006488968785106e-06,
"loss": 0.3876,
"step": 1691
},
{
"epoch": 1.82262118491921,
"grad_norm": 0.3275144398212433,
"learning_rate": 4.000344144880108e-06,
"loss": 0.4053,
"step": 1692
},
{
"epoch": 1.8236983842010772,
"grad_norm": 0.2963061034679413,
"learning_rate": 3.994200893272676e-06,
"loss": 0.3678,
"step": 1693
},
{
"epoch": 1.8247755834829444,
"grad_norm": 0.33487895131111145,
"learning_rate": 3.988059223625155e-06,
"loss": 0.402,
"step": 1694
},
{
"epoch": 1.8258527827648114,
"grad_norm": 0.31342634558677673,
"learning_rate": 3.981919145597404e-06,
"loss": 0.4244,
"step": 1695
},
{
"epoch": 1.8269299820466787,
"grad_norm": 0.30831387639045715,
"learning_rate": 3.97578066884677e-06,
"loss": 0.4158,
"step": 1696
},
{
"epoch": 1.828007181328546,
"grad_norm": 0.2899012863636017,
"learning_rate": 3.9696438030280925e-06,
"loss": 0.3551,
"step": 1697
},
{
"epoch": 1.829084380610413,
"grad_norm": 0.3651769459247589,
"learning_rate": 3.9635085577936706e-06,
"loss": 0.4014,
"step": 1698
},
{
"epoch": 1.8301615798922801,
"grad_norm": 0.3244316875934601,
"learning_rate": 3.957374942793259e-06,
"loss": 0.388,
"step": 1699
},
{
"epoch": 1.8312387791741473,
"grad_norm": 0.3279629349708557,
"learning_rate": 3.951242967674042e-06,
"loss": 0.4615,
"step": 1700
},
{
"epoch": 1.8323159784560143,
"grad_norm": 0.2855011224746704,
"learning_rate": 3.94511264208063e-06,
"loss": 0.35,
"step": 1701
},
{
"epoch": 1.8333931777378814,
"grad_norm": 0.3636470437049866,
"learning_rate": 3.938983975655039e-06,
"loss": 0.4525,
"step": 1702
},
{
"epoch": 1.8344703770197488,
"grad_norm": 0.35178375244140625,
"learning_rate": 3.932856978036671e-06,
"loss": 0.404,
"step": 1703
},
{
"epoch": 1.8355475763016158,
"grad_norm": 0.28978657722473145,
"learning_rate": 3.926731658862307e-06,
"loss": 0.379,
"step": 1704
},
{
"epoch": 1.8366247755834828,
"grad_norm": 0.28477418422698975,
"learning_rate": 3.920608027766086e-06,
"loss": 0.3963,
"step": 1705
},
{
"epoch": 1.8377019748653503,
"grad_norm": 0.39070865511894226,
"learning_rate": 3.914486094379497e-06,
"loss": 0.4023,
"step": 1706
},
{
"epoch": 1.8387791741472173,
"grad_norm": 0.34812501072883606,
"learning_rate": 3.90836586833135e-06,
"loss": 0.3978,
"step": 1707
},
{
"epoch": 1.8398563734290843,
"grad_norm": 0.2860909104347229,
"learning_rate": 3.902247359247775e-06,
"loss": 0.3905,
"step": 1708
},
{
"epoch": 1.8409335727109515,
"grad_norm": 0.3125012218952179,
"learning_rate": 3.8961305767522015e-06,
"loss": 0.4186,
"step": 1709
},
{
"epoch": 1.8420107719928187,
"grad_norm": 0.37062105536460876,
"learning_rate": 3.890015530465342e-06,
"loss": 0.437,
"step": 1710
},
{
"epoch": 1.8430879712746857,
"grad_norm": 0.3444374203681946,
"learning_rate": 3.883902230005181e-06,
"loss": 0.3863,
"step": 1711
},
{
"epoch": 1.844165170556553,
"grad_norm": 0.3282957077026367,
"learning_rate": 3.877790684986953e-06,
"loss": 0.4154,
"step": 1712
},
{
"epoch": 1.8452423698384202,
"grad_norm": 0.31650158762931824,
"learning_rate": 3.871680905023133e-06,
"loss": 0.3834,
"step": 1713
},
{
"epoch": 1.8463195691202872,
"grad_norm": 0.3248016834259033,
"learning_rate": 3.865572899723423e-06,
"loss": 0.4203,
"step": 1714
},
{
"epoch": 1.8473967684021544,
"grad_norm": 0.36757394671440125,
"learning_rate": 3.859466678694728e-06,
"loss": 0.4197,
"step": 1715
},
{
"epoch": 1.8484739676840216,
"grad_norm": 0.314170241355896,
"learning_rate": 3.853362251541153e-06,
"loss": 0.371,
"step": 1716
},
{
"epoch": 1.8495511669658886,
"grad_norm": 0.3025366961956024,
"learning_rate": 3.847259627863974e-06,
"loss": 0.3935,
"step": 1717
},
{
"epoch": 1.8506283662477558,
"grad_norm": 0.3148907423019409,
"learning_rate": 3.841158817261637e-06,
"loss": 0.3956,
"step": 1718
},
{
"epoch": 1.851705565529623,
"grad_norm": 0.3267936110496521,
"learning_rate": 3.8350598293297345e-06,
"loss": 0.4408,
"step": 1719
},
{
"epoch": 1.85278276481149,
"grad_norm": 0.3058549761772156,
"learning_rate": 3.82896267366099e-06,
"loss": 0.3488,
"step": 1720
},
{
"epoch": 1.8538599640933573,
"grad_norm": 0.3332759439945221,
"learning_rate": 3.822867359845248e-06,
"loss": 0.4289,
"step": 1721
},
{
"epoch": 1.8549371633752245,
"grad_norm": 0.31015682220458984,
"learning_rate": 3.816773897469454e-06,
"loss": 0.411,
"step": 1722
},
{
"epoch": 1.8560143626570915,
"grad_norm": 0.3165980279445648,
"learning_rate": 3.8106822961176443e-06,
"loss": 0.4186,
"step": 1723
},
{
"epoch": 1.8570915619389587,
"grad_norm": 0.3124646544456482,
"learning_rate": 3.8045925653709238e-06,
"loss": 0.3901,
"step": 1724
},
{
"epoch": 1.858168761220826,
"grad_norm": 0.30677077174186707,
"learning_rate": 3.7985047148074584e-06,
"loss": 0.4139,
"step": 1725
},
{
"epoch": 1.859245960502693,
"grad_norm": 0.2697823643684387,
"learning_rate": 3.792418754002457e-06,
"loss": 0.3514,
"step": 1726
},
{
"epoch": 1.8603231597845602,
"grad_norm": 0.32413095235824585,
"learning_rate": 3.7863346925281565e-06,
"loss": 0.4122,
"step": 1727
},
{
"epoch": 1.8614003590664274,
"grad_norm": 0.2940352261066437,
"learning_rate": 3.7802525399537997e-06,
"loss": 0.4128,
"step": 1728
},
{
"epoch": 1.8624775583482944,
"grad_norm": 0.32525020837783813,
"learning_rate": 3.774172305845636e-06,
"loss": 0.4161,
"step": 1729
},
{
"epoch": 1.8635547576301614,
"grad_norm": 0.30089908838272095,
"learning_rate": 3.7680939997668942e-06,
"loss": 0.402,
"step": 1730
},
{
"epoch": 1.8646319569120289,
"grad_norm": 0.28563210368156433,
"learning_rate": 3.76201763127777e-06,
"loss": 0.3704,
"step": 1731
},
{
"epoch": 1.8657091561938959,
"grad_norm": 0.3135564923286438,
"learning_rate": 3.755943209935411e-06,
"loss": 0.4636,
"step": 1732
},
{
"epoch": 1.8667863554757629,
"grad_norm": 0.28799933195114136,
"learning_rate": 3.749870745293903e-06,
"loss": 0.3825,
"step": 1733
},
{
"epoch": 1.86786355475763,
"grad_norm": 0.28616318106651306,
"learning_rate": 3.7438002469042567e-06,
"loss": 0.3783,
"step": 1734
},
{
"epoch": 1.8689407540394973,
"grad_norm": 0.31118789315223694,
"learning_rate": 3.737731724314384e-06,
"loss": 0.4277,
"step": 1735
},
{
"epoch": 1.8700179533213643,
"grad_norm": 0.30302414298057556,
"learning_rate": 3.7316651870690957e-06,
"loss": 0.3949,
"step": 1736
},
{
"epoch": 1.8710951526032316,
"grad_norm": 0.30837923288345337,
"learning_rate": 3.725600644710078e-06,
"loss": 0.3731,
"step": 1737
},
{
"epoch": 1.8721723518850988,
"grad_norm": 0.3579249083995819,
"learning_rate": 3.7195381067758755e-06,
"loss": 0.412,
"step": 1738
},
{
"epoch": 1.8732495511669658,
"grad_norm": 0.29725703597068787,
"learning_rate": 3.7134775828018864e-06,
"loss": 0.3809,
"step": 1739
},
{
"epoch": 1.874326750448833,
"grad_norm": 0.3607236444950104,
"learning_rate": 3.707419082320336e-06,
"loss": 0.4317,
"step": 1740
},
{
"epoch": 1.8754039497307002,
"grad_norm": 0.3588671386241913,
"learning_rate": 3.70136261486027e-06,
"loss": 0.3785,
"step": 1741
},
{
"epoch": 1.8764811490125672,
"grad_norm": 0.26367682218551636,
"learning_rate": 3.6953081899475363e-06,
"loss": 0.3452,
"step": 1742
},
{
"epoch": 1.8775583482944345,
"grad_norm": 0.3392777740955353,
"learning_rate": 3.68925581710477e-06,
"loss": 0.3989,
"step": 1743
},
{
"epoch": 1.8786355475763017,
"grad_norm": 0.3400457203388214,
"learning_rate": 3.683205505851377e-06,
"loss": 0.4203,
"step": 1744
},
{
"epoch": 1.8797127468581687,
"grad_norm": 0.3056473433971405,
"learning_rate": 3.6771572657035214e-06,
"loss": 0.3779,
"step": 1745
},
{
"epoch": 1.880789946140036,
"grad_norm": 0.31150609254837036,
"learning_rate": 3.6711111061741133e-06,
"loss": 0.4088,
"step": 1746
},
{
"epoch": 1.8818671454219031,
"grad_norm": 0.31468626856803894,
"learning_rate": 3.6650670367727843e-06,
"loss": 0.4061,
"step": 1747
},
{
"epoch": 1.8829443447037701,
"grad_norm": 0.3258722126483917,
"learning_rate": 3.6590250670058848e-06,
"loss": 0.3803,
"step": 1748
},
{
"epoch": 1.8840215439856374,
"grad_norm": 0.3498111367225647,
"learning_rate": 3.652985206376455e-06,
"loss": 0.414,
"step": 1749
},
{
"epoch": 1.8850987432675046,
"grad_norm": 0.3353995084762573,
"learning_rate": 3.646947464384224e-06,
"loss": 0.4106,
"step": 1750
},
{
"epoch": 1.8861759425493716,
"grad_norm": 0.34021976590156555,
"learning_rate": 3.6409118505255896e-06,
"loss": 0.4082,
"step": 1751
},
{
"epoch": 1.8872531418312388,
"grad_norm": 0.30451297760009766,
"learning_rate": 3.6348783742935967e-06,
"loss": 0.3847,
"step": 1752
},
{
"epoch": 1.888330341113106,
"grad_norm": 0.2944905757904053,
"learning_rate": 3.628847045177932e-06,
"loss": 0.3673,
"step": 1753
},
{
"epoch": 1.889407540394973,
"grad_norm": 0.2865719199180603,
"learning_rate": 3.622817872664905e-06,
"loss": 0.394,
"step": 1754
},
{
"epoch": 1.89048473967684,
"grad_norm": 0.33090847730636597,
"learning_rate": 3.616790866237433e-06,
"loss": 0.3883,
"step": 1755
},
{
"epoch": 1.8915619389587075,
"grad_norm": 0.32876595854759216,
"learning_rate": 3.610766035375023e-06,
"loss": 0.3951,
"step": 1756
},
{
"epoch": 1.8926391382405745,
"grad_norm": 0.3173961639404297,
"learning_rate": 3.6047433895537657e-06,
"loss": 0.4043,
"step": 1757
},
{
"epoch": 1.8937163375224415,
"grad_norm": 0.3128277659416199,
"learning_rate": 3.598722938246314e-06,
"loss": 0.4195,
"step": 1758
},
{
"epoch": 1.894793536804309,
"grad_norm": 0.3198891282081604,
"learning_rate": 3.5927046909218634e-06,
"loss": 0.396,
"step": 1759
},
{
"epoch": 1.895870736086176,
"grad_norm": 0.3302939832210541,
"learning_rate": 3.5866886570461486e-06,
"loss": 0.4144,
"step": 1760
},
{
"epoch": 1.896947935368043,
"grad_norm": 0.326718270778656,
"learning_rate": 3.580674846081421e-06,
"loss": 0.3844,
"step": 1761
},
{
"epoch": 1.8980251346499102,
"grad_norm": 0.30662721395492554,
"learning_rate": 3.574663267486438e-06,
"loss": 0.3882,
"step": 1762
},
{
"epoch": 1.8991023339317774,
"grad_norm": 0.3321872651576996,
"learning_rate": 3.5686539307164427e-06,
"loss": 0.4391,
"step": 1763
},
{
"epoch": 1.9001795332136444,
"grad_norm": 0.32872331142425537,
"learning_rate": 3.5626468452231534e-06,
"loss": 0.3723,
"step": 1764
},
{
"epoch": 1.9012567324955116,
"grad_norm": 0.3216870427131653,
"learning_rate": 3.556642020454747e-06,
"loss": 0.4034,
"step": 1765
},
{
"epoch": 1.9023339317773789,
"grad_norm": 0.3641759753227234,
"learning_rate": 3.5506394658558474e-06,
"loss": 0.4219,
"step": 1766
},
{
"epoch": 1.9034111310592459,
"grad_norm": 0.3454039990901947,
"learning_rate": 3.5446391908675036e-06,
"loss": 0.3849,
"step": 1767
},
{
"epoch": 1.904488330341113,
"grad_norm": 0.31155654788017273,
"learning_rate": 3.538641204927181e-06,
"loss": 0.369,
"step": 1768
},
{
"epoch": 1.9055655296229803,
"grad_norm": 0.3384184241294861,
"learning_rate": 3.532645517468748e-06,
"loss": 0.4074,
"step": 1769
},
{
"epoch": 1.9066427289048473,
"grad_norm": 0.3226218521595001,
"learning_rate": 3.5266521379224506e-06,
"loss": 0.3892,
"step": 1770
},
{
"epoch": 1.9077199281867145,
"grad_norm": 0.3048873245716095,
"learning_rate": 3.5206610757149123e-06,
"loss": 0.3762,
"step": 1771
},
{
"epoch": 1.9087971274685818,
"grad_norm": 0.3158170282840729,
"learning_rate": 3.5146723402691054e-06,
"loss": 0.4129,
"step": 1772
},
{
"epoch": 1.9098743267504488,
"grad_norm": 0.2803991436958313,
"learning_rate": 3.508685941004348e-06,
"loss": 0.3642,
"step": 1773
},
{
"epoch": 1.910951526032316,
"grad_norm": 0.28167566657066345,
"learning_rate": 3.50270188733628e-06,
"loss": 0.3804,
"step": 1774
},
{
"epoch": 1.9120287253141832,
"grad_norm": 0.32653680443763733,
"learning_rate": 3.496720188676856e-06,
"loss": 0.4318,
"step": 1775
},
{
"epoch": 1.9131059245960502,
"grad_norm": 0.3046547472476959,
"learning_rate": 3.490740854434321e-06,
"loss": 0.3754,
"step": 1776
},
{
"epoch": 1.9141831238779174,
"grad_norm": 0.305685430765152,
"learning_rate": 3.4847638940132054e-06,
"loss": 0.4053,
"step": 1777
},
{
"epoch": 1.9152603231597847,
"grad_norm": 0.3087405562400818,
"learning_rate": 3.478789316814306e-06,
"loss": 0.3818,
"step": 1778
},
{
"epoch": 1.9163375224416517,
"grad_norm": 0.31966203451156616,
"learning_rate": 3.472817132234669e-06,
"loss": 0.4288,
"step": 1779
},
{
"epoch": 1.917414721723519,
"grad_norm": 0.27336421608924866,
"learning_rate": 3.466847349667578e-06,
"loss": 0.3642,
"step": 1780
},
{
"epoch": 1.9184919210053861,
"grad_norm": 0.32802098989486694,
"learning_rate": 3.460879978502538e-06,
"loss": 0.427,
"step": 1781
},
{
"epoch": 1.9195691202872531,
"grad_norm": 0.314151406288147,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.3953,
"step": 1782
},
{
"epoch": 1.9206463195691201,
"grad_norm": 0.275924950838089,
"learning_rate": 3.4489525079176612e-06,
"loss": 0.3801,
"step": 1783
},
{
"epoch": 1.9217235188509876,
"grad_norm": 0.290600061416626,
"learning_rate": 3.442992427257812e-06,
"loss": 0.4036,
"step": 1784
},
{
"epoch": 1.9228007181328546,
"grad_norm": 0.27883657813072205,
"learning_rate": 3.4370347955199634e-06,
"loss": 0.3671,
"step": 1785
},
{
"epoch": 1.9238779174147216,
"grad_norm": 0.29276221990585327,
"learning_rate": 3.43107962207451e-06,
"loss": 0.396,
"step": 1786
},
{
"epoch": 1.924955116696589,
"grad_norm": 0.3173641264438629,
"learning_rate": 3.4251269162879826e-06,
"loss": 0.4481,
"step": 1787
},
{
"epoch": 1.926032315978456,
"grad_norm": 0.2623595893383026,
"learning_rate": 3.419176687523024e-06,
"loss": 0.3533,
"step": 1788
},
{
"epoch": 1.927109515260323,
"grad_norm": 0.310444712638855,
"learning_rate": 3.4132289451383866e-06,
"loss": 0.4047,
"step": 1789
},
{
"epoch": 1.9281867145421903,
"grad_norm": 0.29494354128837585,
"learning_rate": 3.4072836984889137e-06,
"loss": 0.3804,
"step": 1790
},
{
"epoch": 1.9292639138240575,
"grad_norm": 0.3028341233730316,
"learning_rate": 3.401340956925515e-06,
"loss": 0.414,
"step": 1791
},
{
"epoch": 1.9303411131059245,
"grad_norm": 0.26845231652259827,
"learning_rate": 3.3954007297951693e-06,
"loss": 0.3837,
"step": 1792
},
{
"epoch": 1.9314183123877917,
"grad_norm": 0.26886746287345886,
"learning_rate": 3.3894630264408936e-06,
"loss": 0.3843,
"step": 1793
},
{
"epoch": 1.932495511669659,
"grad_norm": 0.3210623860359192,
"learning_rate": 3.3835278562017405e-06,
"loss": 0.4072,
"step": 1794
},
{
"epoch": 1.933572710951526,
"grad_norm": 0.2861291468143463,
"learning_rate": 3.3775952284127766e-06,
"loss": 0.3779,
"step": 1795
},
{
"epoch": 1.9346499102333932,
"grad_norm": 0.2961629331111908,
"learning_rate": 3.3716651524050677e-06,
"loss": 0.439,
"step": 1796
},
{
"epoch": 1.9357271095152604,
"grad_norm": 0.3172108829021454,
"learning_rate": 3.3657376375056684e-06,
"loss": 0.4195,
"step": 1797
},
{
"epoch": 1.9368043087971274,
"grad_norm": 0.3115575313568115,
"learning_rate": 3.3598126930376055e-06,
"loss": 0.3932,
"step": 1798
},
{
"epoch": 1.9378815080789946,
"grad_norm": 0.2706233561038971,
"learning_rate": 3.353890328319861e-06,
"loss": 0.3559,
"step": 1799
},
{
"epoch": 1.9389587073608618,
"grad_norm": 0.3000124394893646,
"learning_rate": 3.347970552667361e-06,
"loss": 0.4006,
"step": 1800
},
{
"epoch": 1.9400359066427288,
"grad_norm": 0.338463693857193,
"learning_rate": 3.3420533753909566e-06,
"loss": 0.3986,
"step": 1801
},
{
"epoch": 1.941113105924596,
"grad_norm": 0.3336849808692932,
"learning_rate": 3.3361388057974148e-06,
"loss": 0.4297,
"step": 1802
},
{
"epoch": 1.9421903052064633,
"grad_norm": 0.2952408492565155,
"learning_rate": 3.3302268531894023e-06,
"loss": 0.3764,
"step": 1803
},
{
"epoch": 1.9432675044883303,
"grad_norm": 0.3435702919960022,
"learning_rate": 3.3243175268654656e-06,
"loss": 0.4352,
"step": 1804
},
{
"epoch": 1.9443447037701975,
"grad_norm": 0.3342050611972809,
"learning_rate": 3.3184108361200235e-06,
"loss": 0.3898,
"step": 1805
},
{
"epoch": 1.9454219030520647,
"grad_norm": 0.3066212832927704,
"learning_rate": 3.3125067902433482e-06,
"loss": 0.3899,
"step": 1806
},
{
"epoch": 1.9464991023339318,
"grad_norm": 0.3134807050228119,
"learning_rate": 3.306605398521555e-06,
"loss": 0.4091,
"step": 1807
},
{
"epoch": 1.947576301615799,
"grad_norm": 0.3158106803894043,
"learning_rate": 3.300706670236579e-06,
"loss": 0.3984,
"step": 1808
},
{
"epoch": 1.9486535008976662,
"grad_norm": 0.32776862382888794,
"learning_rate": 3.29481061466617e-06,
"loss": 0.3836,
"step": 1809
},
{
"epoch": 1.9497307001795332,
"grad_norm": 0.2883926331996918,
"learning_rate": 3.2889172410838755e-06,
"loss": 0.4111,
"step": 1810
},
{
"epoch": 1.9508078994614002,
"grad_norm": 0.32595670223236084,
"learning_rate": 3.283026558759019e-06,
"loss": 0.3967,
"step": 1811
},
{
"epoch": 1.9518850987432677,
"grad_norm": 0.3464212417602539,
"learning_rate": 3.2771385769566976e-06,
"loss": 0.4291,
"step": 1812
},
{
"epoch": 1.9529622980251347,
"grad_norm": 0.3280077874660492,
"learning_rate": 3.2712533049377543e-06,
"loss": 0.369,
"step": 1813
},
{
"epoch": 1.9540394973070017,
"grad_norm": 0.28756144642829895,
"learning_rate": 3.2653707519587756e-06,
"loss": 0.3762,
"step": 1814
},
{
"epoch": 1.9551166965888689,
"grad_norm": 0.29988667368888855,
"learning_rate": 3.259490927272071e-06,
"loss": 0.3832,
"step": 1815
},
{
"epoch": 1.956193895870736,
"grad_norm": 0.29910799860954285,
"learning_rate": 3.253613840125654e-06,
"loss": 0.407,
"step": 1816
},
{
"epoch": 1.9572710951526031,
"grad_norm": 0.3187161982059479,
"learning_rate": 3.2477394997632373e-06,
"loss": 0.4497,
"step": 1817
},
{
"epoch": 1.9583482944344703,
"grad_norm": 0.2946453094482422,
"learning_rate": 3.241867915424211e-06,
"loss": 0.3617,
"step": 1818
},
{
"epoch": 1.9594254937163376,
"grad_norm": 0.3105182349681854,
"learning_rate": 3.235999096343633e-06,
"loss": 0.3864,
"step": 1819
},
{
"epoch": 1.9605026929982046,
"grad_norm": 0.3276148736476898,
"learning_rate": 3.230133051752207e-06,
"loss": 0.4242,
"step": 1820
},
{
"epoch": 1.9615798922800718,
"grad_norm": 0.30322229862213135,
"learning_rate": 3.22426979087628e-06,
"loss": 0.3693,
"step": 1821
},
{
"epoch": 1.962657091561939,
"grad_norm": 0.3252699673175812,
"learning_rate": 3.2184093229378117e-06,
"loss": 0.4265,
"step": 1822
},
{
"epoch": 1.963734290843806,
"grad_norm": 0.3165656626224518,
"learning_rate": 3.212551657154376e-06,
"loss": 0.427,
"step": 1823
},
{
"epoch": 1.9648114901256732,
"grad_norm": 0.29955458641052246,
"learning_rate": 3.2066968027391377e-06,
"loss": 0.3604,
"step": 1824
},
{
"epoch": 1.9658886894075405,
"grad_norm": 0.3262845277786255,
"learning_rate": 3.200844768900837e-06,
"loss": 0.4178,
"step": 1825
},
{
"epoch": 1.9669658886894075,
"grad_norm": 0.3196795582771301,
"learning_rate": 3.1949955648437824e-06,
"loss": 0.3918,
"step": 1826
},
{
"epoch": 1.9680430879712747,
"grad_norm": 0.34281331300735474,
"learning_rate": 3.1891491997678302e-06,
"loss": 0.3778,
"step": 1827
},
{
"epoch": 1.969120287253142,
"grad_norm": 0.3226211369037628,
"learning_rate": 3.1833056828683673e-06,
"loss": 0.3971,
"step": 1828
},
{
"epoch": 1.970197486535009,
"grad_norm": 0.29890337586402893,
"learning_rate": 3.177465023336306e-06,
"loss": 0.3857,
"step": 1829
},
{
"epoch": 1.9712746858168761,
"grad_norm": 0.3117425739765167,
"learning_rate": 3.171627230358063e-06,
"loss": 0.4003,
"step": 1830
},
{
"epoch": 1.9723518850987434,
"grad_norm": 0.31731322407722473,
"learning_rate": 3.1657923131155444e-06,
"loss": 0.3518,
"step": 1831
},
{
"epoch": 1.9734290843806104,
"grad_norm": 0.3363853693008423,
"learning_rate": 3.1599602807861374e-06,
"loss": 0.395,
"step": 1832
},
{
"epoch": 1.9745062836624776,
"grad_norm": 0.3119727075099945,
"learning_rate": 3.1541311425426856e-06,
"loss": 0.4121,
"step": 1833
},
{
"epoch": 1.9755834829443448,
"grad_norm": 0.3245994746685028,
"learning_rate": 3.1483049075534853e-06,
"loss": 0.4198,
"step": 1834
},
{
"epoch": 1.9766606822262118,
"grad_norm": 0.30005812644958496,
"learning_rate": 3.1424815849822665e-06,
"loss": 0.3959,
"step": 1835
},
{
"epoch": 1.9777378815080788,
"grad_norm": 0.3131362199783325,
"learning_rate": 3.136661183988175e-06,
"loss": 0.409,
"step": 1836
},
{
"epoch": 1.9788150807899463,
"grad_norm": 0.30022886395454407,
"learning_rate": 3.130843713725765e-06,
"loss": 0.3991,
"step": 1837
},
{
"epoch": 1.9798922800718133,
"grad_norm": 0.3111904263496399,
"learning_rate": 3.12502918334498e-06,
"loss": 0.3878,
"step": 1838
},
{
"epoch": 1.9809694793536803,
"grad_norm": 0.326963871717453,
"learning_rate": 3.119217601991139e-06,
"loss": 0.4197,
"step": 1839
},
{
"epoch": 1.9820466786355477,
"grad_norm": 0.310465544462204,
"learning_rate": 3.1134089788049226e-06,
"loss": 0.4112,
"step": 1840
},
{
"epoch": 1.9831238779174147,
"grad_norm": 0.2793821692466736,
"learning_rate": 3.107603322922359e-06,
"loss": 0.3555,
"step": 1841
},
{
"epoch": 1.9842010771992817,
"grad_norm": 0.32500728964805603,
"learning_rate": 3.1018006434748115e-06,
"loss": 0.4168,
"step": 1842
},
{
"epoch": 1.985278276481149,
"grad_norm": 0.31032678484916687,
"learning_rate": 3.0960009495889564e-06,
"loss": 0.3892,
"step": 1843
},
{
"epoch": 1.9863554757630162,
"grad_norm": 0.3267922103404999,
"learning_rate": 3.090204250386779e-06,
"loss": 0.4229,
"step": 1844
},
{
"epoch": 1.9874326750448832,
"grad_norm": 0.3038681447505951,
"learning_rate": 3.084410554985553e-06,
"loss": 0.3881,
"step": 1845
},
{
"epoch": 1.9885098743267504,
"grad_norm": 0.3073458969593048,
"learning_rate": 3.078619872497827e-06,
"loss": 0.3925,
"step": 1846
},
{
"epoch": 1.9895870736086176,
"grad_norm": 0.2960895597934723,
"learning_rate": 3.072832212031413e-06,
"loss": 0.3823,
"step": 1847
},
{
"epoch": 1.9906642728904846,
"grad_norm": 0.32073989510536194,
"learning_rate": 3.0670475826893663e-06,
"loss": 0.414,
"step": 1848
},
{
"epoch": 1.9917414721723519,
"grad_norm": 0.2960592806339264,
"learning_rate": 3.0612659935699774e-06,
"loss": 0.3875,
"step": 1849
},
{
"epoch": 1.992818671454219,
"grad_norm": 0.2826525866985321,
"learning_rate": 3.055487453766755e-06,
"loss": 0.3314,
"step": 1850
},
{
"epoch": 1.993895870736086,
"grad_norm": 0.3213357627391815,
"learning_rate": 3.049711972368411e-06,
"loss": 0.4022,
"step": 1851
},
{
"epoch": 1.9949730700179533,
"grad_norm": 0.3375919759273529,
"learning_rate": 3.043939558458846e-06,
"loss": 0.4042,
"step": 1852
},
{
"epoch": 1.9960502692998205,
"grad_norm": 0.3088492453098297,
"learning_rate": 3.038170221117138e-06,
"loss": 0.4278,
"step": 1853
},
{
"epoch": 1.9971274685816875,
"grad_norm": 0.31650954484939575,
"learning_rate": 3.032403969417523e-06,
"loss": 0.3807,
"step": 1854
},
{
"epoch": 1.9982046678635548,
"grad_norm": 0.3157062232494354,
"learning_rate": 3.026640812429388e-06,
"loss": 0.4152,
"step": 1855
},
{
"epoch": 1.999281867145422,
"grad_norm": 0.292804479598999,
"learning_rate": 3.0208807592172486e-06,
"loss": 0.4024,
"step": 1856
},
{
"epoch": 2.000359066427289,
"grad_norm": 0.5354664325714111,
"learning_rate": 3.01512381884074e-06,
"loss": 0.6262,
"step": 1857
},
{
"epoch": 2.001436265709156,
"grad_norm": 0.32860124111175537,
"learning_rate": 3.0093700003546023e-06,
"loss": 0.3948,
"step": 1858
},
{
"epoch": 2.0025134649910235,
"grad_norm": 0.29116785526275635,
"learning_rate": 3.0036193128086667e-06,
"loss": 0.3398,
"step": 1859
},
{
"epoch": 2.0035906642728905,
"grad_norm": 0.3160048723220825,
"learning_rate": 2.9978717652478343e-06,
"loss": 0.3915,
"step": 1860
},
{
"epoch": 2.0046678635547575,
"grad_norm": 0.27923333644866943,
"learning_rate": 2.9921273667120736e-06,
"loss": 0.359,
"step": 1861
},
{
"epoch": 2.005745062836625,
"grad_norm": 0.3296511769294739,
"learning_rate": 2.986386126236398e-06,
"loss": 0.3732,
"step": 1862
},
{
"epoch": 2.006822262118492,
"grad_norm": 0.3194176256656647,
"learning_rate": 2.980648052850852e-06,
"loss": 0.3832,
"step": 1863
},
{
"epoch": 2.007899461400359,
"grad_norm": 0.33017516136169434,
"learning_rate": 2.9749131555805035e-06,
"loss": 0.3889,
"step": 1864
},
{
"epoch": 2.0089766606822264,
"grad_norm": 0.2907126843929291,
"learning_rate": 2.969181443445417e-06,
"loss": 0.3683,
"step": 1865
},
{
"epoch": 2.0100538599640934,
"grad_norm": 0.2984246015548706,
"learning_rate": 2.963452925460654e-06,
"loss": 0.3749,
"step": 1866
},
{
"epoch": 2.0111310592459604,
"grad_norm": 0.30581575632095337,
"learning_rate": 2.9577276106362523e-06,
"loss": 0.3946,
"step": 1867
},
{
"epoch": 2.012208258527828,
"grad_norm": 0.2967156171798706,
"learning_rate": 2.952005507977207e-06,
"loss": 0.3681,
"step": 1868
},
{
"epoch": 2.013285457809695,
"grad_norm": 0.3053143322467804,
"learning_rate": 2.946286626483463e-06,
"loss": 0.3785,
"step": 1869
},
{
"epoch": 2.014362657091562,
"grad_norm": 0.2774108350276947,
"learning_rate": 2.9405709751499017e-06,
"loss": 0.3286,
"step": 1870
},
{
"epoch": 2.0154398563734293,
"grad_norm": 0.3262989819049835,
"learning_rate": 2.9348585629663213e-06,
"loss": 0.3738,
"step": 1871
},
{
"epoch": 2.0165170556552963,
"grad_norm": 0.3032521903514862,
"learning_rate": 2.9291493989174234e-06,
"loss": 0.384,
"step": 1872
},
{
"epoch": 2.0175942549371633,
"grad_norm": 0.30339065194129944,
"learning_rate": 2.923443491982804e-06,
"loss": 0.3763,
"step": 1873
},
{
"epoch": 2.0186714542190307,
"grad_norm": 0.30798229575157166,
"learning_rate": 2.9177408511369395e-06,
"loss": 0.3555,
"step": 1874
},
{
"epoch": 2.0197486535008977,
"grad_norm": 0.29715853929519653,
"learning_rate": 2.9120414853491574e-06,
"loss": 0.3691,
"step": 1875
},
{
"epoch": 2.0208258527827647,
"grad_norm": 0.30873653292655945,
"learning_rate": 2.9063454035836447e-06,
"loss": 0.3791,
"step": 1876
},
{
"epoch": 2.021903052064632,
"grad_norm": 0.3008333742618561,
"learning_rate": 2.900652614799422e-06,
"loss": 0.3797,
"step": 1877
},
{
"epoch": 2.022980251346499,
"grad_norm": 0.3410451114177704,
"learning_rate": 2.8949631279503265e-06,
"loss": 0.3612,
"step": 1878
},
{
"epoch": 2.024057450628366,
"grad_norm": 0.3563460409641266,
"learning_rate": 2.889276951985005e-06,
"loss": 0.3818,
"step": 1879
},
{
"epoch": 2.025134649910233,
"grad_norm": 0.29962658882141113,
"learning_rate": 2.8835940958468954e-06,
"loss": 0.3624,
"step": 1880
},
{
"epoch": 2.0262118491921006,
"grad_norm": 0.3057413399219513,
"learning_rate": 2.877914568474218e-06,
"loss": 0.3688,
"step": 1881
},
{
"epoch": 2.0272890484739676,
"grad_norm": 0.32542818784713745,
"learning_rate": 2.872238378799949e-06,
"loss": 0.395,
"step": 1882
},
{
"epoch": 2.0283662477558346,
"grad_norm": 0.2896184027194977,
"learning_rate": 2.866565535751822e-06,
"loss": 0.364,
"step": 1883
},
{
"epoch": 2.029443447037702,
"grad_norm": 0.34391382336616516,
"learning_rate": 2.8608960482523058e-06,
"loss": 0.3982,
"step": 1884
},
{
"epoch": 2.030520646319569,
"grad_norm": 0.31426018476486206,
"learning_rate": 2.8552299252185915e-06,
"loss": 0.3758,
"step": 1885
},
{
"epoch": 2.031597845601436,
"grad_norm": 0.29396316409111023,
"learning_rate": 2.849567175562574e-06,
"loss": 0.3304,
"step": 1886
},
{
"epoch": 2.0326750448833035,
"grad_norm": 0.3297179043292999,
"learning_rate": 2.8439078081908487e-06,
"loss": 0.424,
"step": 1887
},
{
"epoch": 2.0337522441651705,
"grad_norm": 0.2860085368156433,
"learning_rate": 2.8382518320046877e-06,
"loss": 0.3485,
"step": 1888
},
{
"epoch": 2.0348294434470375,
"grad_norm": 0.31872880458831787,
"learning_rate": 2.8325992559000315e-06,
"loss": 0.3862,
"step": 1889
},
{
"epoch": 2.035906642728905,
"grad_norm": 0.30101990699768066,
"learning_rate": 2.826950088767469e-06,
"loss": 0.373,
"step": 1890
},
{
"epoch": 2.036983842010772,
"grad_norm": 0.3038983643054962,
"learning_rate": 2.82130433949223e-06,
"loss": 0.3708,
"step": 1891
},
{
"epoch": 2.038061041292639,
"grad_norm": 0.27234700322151184,
"learning_rate": 2.8156620169541698e-06,
"loss": 0.3362,
"step": 1892
},
{
"epoch": 2.0391382405745064,
"grad_norm": 0.3316778242588043,
"learning_rate": 2.8100231300277514e-06,
"loss": 0.3572,
"step": 1893
},
{
"epoch": 2.0402154398563734,
"grad_norm": 0.3273511528968811,
"learning_rate": 2.8043876875820363e-06,
"loss": 0.3761,
"step": 1894
},
{
"epoch": 2.0412926391382404,
"grad_norm": 0.3026241660118103,
"learning_rate": 2.798755698480668e-06,
"loss": 0.3997,
"step": 1895
},
{
"epoch": 2.042369838420108,
"grad_norm": 0.2999473512172699,
"learning_rate": 2.793127171581854e-06,
"loss": 0.3829,
"step": 1896
},
{
"epoch": 2.043447037701975,
"grad_norm": 0.31387123465538025,
"learning_rate": 2.7875021157383634e-06,
"loss": 0.3659,
"step": 1897
},
{
"epoch": 2.044524236983842,
"grad_norm": 0.3155645430088043,
"learning_rate": 2.7818805397975034e-06,
"loss": 0.3849,
"step": 1898
},
{
"epoch": 2.0456014362657093,
"grad_norm": 0.31519466638565063,
"learning_rate": 2.776262452601104e-06,
"loss": 0.3828,
"step": 1899
},
{
"epoch": 2.0466786355475763,
"grad_norm": 0.3476317226886749,
"learning_rate": 2.770647862985512e-06,
"loss": 0.4074,
"step": 1900
},
{
"epoch": 2.0477558348294433,
"grad_norm": 0.2855173349380493,
"learning_rate": 2.765036779781571e-06,
"loss": 0.3435,
"step": 1901
},
{
"epoch": 2.048833034111311,
"grad_norm": 0.3068677484989166,
"learning_rate": 2.7594292118146136e-06,
"loss": 0.4177,
"step": 1902
},
{
"epoch": 2.049910233393178,
"grad_norm": 0.30308929085731506,
"learning_rate": 2.753825167904438e-06,
"loss": 0.3721,
"step": 1903
},
{
"epoch": 2.050987432675045,
"grad_norm": 0.3041457235813141,
"learning_rate": 2.748224656865304e-06,
"loss": 0.3664,
"step": 1904
},
{
"epoch": 2.0520646319569122,
"grad_norm": 0.29610303044319153,
"learning_rate": 2.7426276875059145e-06,
"loss": 0.3573,
"step": 1905
},
{
"epoch": 2.0531418312387792,
"grad_norm": 0.2744855284690857,
"learning_rate": 2.737034268629397e-06,
"loss": 0.372,
"step": 1906
},
{
"epoch": 2.0542190305206462,
"grad_norm": 0.2901430130004883,
"learning_rate": 2.731444409033297e-06,
"loss": 0.3437,
"step": 1907
},
{
"epoch": 2.0552962298025133,
"grad_norm": 0.30917778611183167,
"learning_rate": 2.7258581175095657e-06,
"loss": 0.3958,
"step": 1908
},
{
"epoch": 2.0563734290843807,
"grad_norm": 0.30461421608924866,
"learning_rate": 2.7202754028445375e-06,
"loss": 0.3906,
"step": 1909
},
{
"epoch": 2.0574506283662477,
"grad_norm": 0.2898961007595062,
"learning_rate": 2.7146962738189254e-06,
"loss": 0.3711,
"step": 1910
},
{
"epoch": 2.0585278276481147,
"grad_norm": 0.3014693856239319,
"learning_rate": 2.709120739207798e-06,
"loss": 0.3599,
"step": 1911
},
{
"epoch": 2.059605026929982,
"grad_norm": 0.34578463435173035,
"learning_rate": 2.7035488077805736e-06,
"loss": 0.4243,
"step": 1912
},
{
"epoch": 2.060682226211849,
"grad_norm": 0.2798299789428711,
"learning_rate": 2.6979804883010052e-06,
"loss": 0.3368,
"step": 1913
},
{
"epoch": 2.061759425493716,
"grad_norm": 0.31570667028427124,
"learning_rate": 2.6924157895271563e-06,
"loss": 0.4181,
"step": 1914
},
{
"epoch": 2.0628366247755836,
"grad_norm": 0.27935585379600525,
"learning_rate": 2.6868547202114047e-06,
"loss": 0.358,
"step": 1915
},
{
"epoch": 2.0639138240574506,
"grad_norm": 0.29281607270240784,
"learning_rate": 2.681297289100418e-06,
"loss": 0.393,
"step": 1916
},
{
"epoch": 2.0649910233393176,
"grad_norm": 0.2860967516899109,
"learning_rate": 2.6757435049351353e-06,
"loss": 0.3768,
"step": 1917
},
{
"epoch": 2.066068222621185,
"grad_norm": 0.3133350610733032,
"learning_rate": 2.670193376450767e-06,
"loss": 0.3925,
"step": 1918
},
{
"epoch": 2.067145421903052,
"grad_norm": 0.33049872517585754,
"learning_rate": 2.6646469123767694e-06,
"loss": 0.3693,
"step": 1919
},
{
"epoch": 2.068222621184919,
"grad_norm": 0.3234219253063202,
"learning_rate": 2.6591041214368383e-06,
"loss": 0.3943,
"step": 1920
},
{
"epoch": 2.0692998204667865,
"grad_norm": 0.2815258204936981,
"learning_rate": 2.6535650123488916e-06,
"loss": 0.3661,
"step": 1921
},
{
"epoch": 2.0703770197486535,
"grad_norm": 0.33411794900894165,
"learning_rate": 2.648029593825051e-06,
"loss": 0.3803,
"step": 1922
},
{
"epoch": 2.0714542190305205,
"grad_norm": 0.3197128474712372,
"learning_rate": 2.642497874571641e-06,
"loss": 0.3744,
"step": 1923
},
{
"epoch": 2.072531418312388,
"grad_norm": 0.29649585485458374,
"learning_rate": 2.636969863289164e-06,
"loss": 0.3939,
"step": 1924
},
{
"epoch": 2.073608617594255,
"grad_norm": 0.28606873750686646,
"learning_rate": 2.63144556867229e-06,
"loss": 0.3692,
"step": 1925
},
{
"epoch": 2.074685816876122,
"grad_norm": 0.2942201793193817,
"learning_rate": 2.6259249994098457e-06,
"loss": 0.3543,
"step": 1926
},
{
"epoch": 2.0757630161579894,
"grad_norm": 0.30836036801338196,
"learning_rate": 2.620408164184798e-06,
"loss": 0.3723,
"step": 1927
},
{
"epoch": 2.0768402154398564,
"grad_norm": 0.2905234396457672,
"learning_rate": 2.614895071674235e-06,
"loss": 0.3891,
"step": 1928
},
{
"epoch": 2.0779174147217234,
"grad_norm": 0.2854272425174713,
"learning_rate": 2.6093857305493666e-06,
"loss": 0.3601,
"step": 1929
},
{
"epoch": 2.078994614003591,
"grad_norm": 0.29839953780174255,
"learning_rate": 2.6038801494754994e-06,
"loss": 0.3751,
"step": 1930
},
{
"epoch": 2.080071813285458,
"grad_norm": 0.2869094908237457,
"learning_rate": 2.5983783371120214e-06,
"loss": 0.3696,
"step": 1931
},
{
"epoch": 2.081149012567325,
"grad_norm": 0.30146855115890503,
"learning_rate": 2.592880302112399e-06,
"loss": 0.404,
"step": 1932
},
{
"epoch": 2.082226211849192,
"grad_norm": 0.2710917890071869,
"learning_rate": 2.5873860531241544e-06,
"loss": 0.3731,
"step": 1933
},
{
"epoch": 2.0833034111310593,
"grad_norm": 0.29889118671417236,
"learning_rate": 2.581895598788857e-06,
"loss": 0.4159,
"step": 1934
},
{
"epoch": 2.0843806104129263,
"grad_norm": 0.2817692756652832,
"learning_rate": 2.5764089477421067e-06,
"loss": 0.3721,
"step": 1935
},
{
"epoch": 2.0854578096947933,
"grad_norm": 0.27690258622169495,
"learning_rate": 2.570926108613521e-06,
"loss": 0.3571,
"step": 1936
},
{
"epoch": 2.0865350089766608,
"grad_norm": 0.28059402108192444,
"learning_rate": 2.565447090026724e-06,
"loss": 0.3534,
"step": 1937
},
{
"epoch": 2.087612208258528,
"grad_norm": 0.3004077672958374,
"learning_rate": 2.5599719005993264e-06,
"loss": 0.4138,
"step": 1938
},
{
"epoch": 2.088689407540395,
"grad_norm": 0.31813308596611023,
"learning_rate": 2.5545005489429185e-06,
"loss": 0.3837,
"step": 1939
},
{
"epoch": 2.0897666068222622,
"grad_norm": 0.27622514963150024,
"learning_rate": 2.5490330436630563e-06,
"loss": 0.3602,
"step": 1940
},
{
"epoch": 2.0908438061041292,
"grad_norm": 0.2941945195198059,
"learning_rate": 2.543569393359243e-06,
"loss": 0.3663,
"step": 1941
},
{
"epoch": 2.0919210053859962,
"grad_norm": 0.31598734855651855,
"learning_rate": 2.538109606624922e-06,
"loss": 0.3766,
"step": 1942
},
{
"epoch": 2.0929982046678637,
"grad_norm": 0.29513019323349,
"learning_rate": 2.5326536920474576e-06,
"loss": 0.4152,
"step": 1943
},
{
"epoch": 2.0940754039497307,
"grad_norm": 0.27299997210502625,
"learning_rate": 2.5272016582081236e-06,
"loss": 0.3941,
"step": 1944
},
{
"epoch": 2.0951526032315977,
"grad_norm": 0.27696993947029114,
"learning_rate": 2.5217535136820927e-06,
"loss": 0.3733,
"step": 1945
},
{
"epoch": 2.096229802513465,
"grad_norm": 0.29740098118782043,
"learning_rate": 2.5163092670384144e-06,
"loss": 0.3629,
"step": 1946
},
{
"epoch": 2.097307001795332,
"grad_norm": 0.2868814468383789,
"learning_rate": 2.5108689268400133e-06,
"loss": 0.3731,
"step": 1947
},
{
"epoch": 2.098384201077199,
"grad_norm": 0.2974447011947632,
"learning_rate": 2.50543250164367e-06,
"loss": 0.3763,
"step": 1948
},
{
"epoch": 2.0994614003590666,
"grad_norm": 0.30036285519599915,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.3798,
"step": 1949
},
{
"epoch": 2.1005385996409336,
"grad_norm": 0.2924955189228058,
"learning_rate": 2.4945714304534584e-06,
"loss": 0.3671,
"step": 1950
},
{
"epoch": 2.1016157989228006,
"grad_norm": 0.2933090925216675,
"learning_rate": 2.489146801542307e-06,
"loss": 0.3877,
"step": 1951
},
{
"epoch": 2.102692998204668,
"grad_norm": 0.2935760021209717,
"learning_rate": 2.4837261217986134e-06,
"loss": 0.3562,
"step": 1952
},
{
"epoch": 2.103770197486535,
"grad_norm": 0.2789822518825531,
"learning_rate": 2.4783093997482367e-06,
"loss": 0.3356,
"step": 1953
},
{
"epoch": 2.104847396768402,
"grad_norm": 0.33238011598587036,
"learning_rate": 2.472896643910802e-06,
"loss": 0.4118,
"step": 1954
},
{
"epoch": 2.1059245960502695,
"grad_norm": 0.2929084002971649,
"learning_rate": 2.4674878627997053e-06,
"loss": 0.3679,
"step": 1955
},
{
"epoch": 2.1070017953321365,
"grad_norm": 0.27412042021751404,
"learning_rate": 2.4620830649220874e-06,
"loss": 0.3467,
"step": 1956
},
{
"epoch": 2.1080789946140035,
"grad_norm": 0.43696802854537964,
"learning_rate": 2.4566822587788234e-06,
"loss": 0.3829,
"step": 1957
},
{
"epoch": 2.109156193895871,
"grad_norm": 0.2916047275066376,
"learning_rate": 2.4512854528645143e-06,
"loss": 0.3748,
"step": 1958
},
{
"epoch": 2.110233393177738,
"grad_norm": 0.2847961187362671,
"learning_rate": 2.445892655667462e-06,
"loss": 0.3807,
"step": 1959
},
{
"epoch": 2.111310592459605,
"grad_norm": 0.3016386330127716,
"learning_rate": 2.440503875669668e-06,
"loss": 0.3699,
"step": 1960
},
{
"epoch": 2.1123877917414724,
"grad_norm": 0.2906850576400757,
"learning_rate": 2.435119121346817e-06,
"loss": 0.3623,
"step": 1961
},
{
"epoch": 2.1134649910233394,
"grad_norm": 0.2936685085296631,
"learning_rate": 2.4297384011682594e-06,
"loss": 0.3895,
"step": 1962
},
{
"epoch": 2.1145421903052064,
"grad_norm": 0.28310006856918335,
"learning_rate": 2.4243617235969996e-06,
"loss": 0.3601,
"step": 1963
},
{
"epoch": 2.1156193895870734,
"grad_norm": 0.30220288038253784,
"learning_rate": 2.418989097089685e-06,
"loss": 0.3806,
"step": 1964
},
{
"epoch": 2.116696588868941,
"grad_norm": 0.29966917634010315,
"learning_rate": 2.413620530096592e-06,
"loss": 0.3688,
"step": 1965
},
{
"epoch": 2.117773788150808,
"grad_norm": 0.28805071115493774,
"learning_rate": 2.408256031061611e-06,
"loss": 0.3635,
"step": 1966
},
{
"epoch": 2.118850987432675,
"grad_norm": 0.3081320524215698,
"learning_rate": 2.402895608422235e-06,
"loss": 0.4241,
"step": 1967
},
{
"epoch": 2.1199281867145423,
"grad_norm": 0.26345717906951904,
"learning_rate": 2.3975392706095447e-06,
"loss": 0.3134,
"step": 1968
},
{
"epoch": 2.1210053859964093,
"grad_norm": 0.35153645277023315,
"learning_rate": 2.392187026048198e-06,
"loss": 0.4158,
"step": 1969
},
{
"epoch": 2.1220825852782763,
"grad_norm": 0.3314216434955597,
"learning_rate": 2.386838883156412e-06,
"loss": 0.4069,
"step": 1970
},
{
"epoch": 2.1231597845601438,
"grad_norm": 0.2866481840610504,
"learning_rate": 2.3814948503459504e-06,
"loss": 0.3425,
"step": 1971
},
{
"epoch": 2.1242369838420108,
"grad_norm": 0.31775057315826416,
"learning_rate": 2.376154936022119e-06,
"loss": 0.3743,
"step": 1972
},
{
"epoch": 2.1253141831238778,
"grad_norm": 0.30802255868911743,
"learning_rate": 2.370819148583741e-06,
"loss": 0.4065,
"step": 1973
},
{
"epoch": 2.126391382405745,
"grad_norm": 0.2827637493610382,
"learning_rate": 2.365487496423152e-06,
"loss": 0.3773,
"step": 1974
},
{
"epoch": 2.127468581687612,
"grad_norm": 0.2885690927505493,
"learning_rate": 2.3601599879261794e-06,
"loss": 0.3828,
"step": 1975
},
{
"epoch": 2.128545780969479,
"grad_norm": 0.3132476806640625,
"learning_rate": 2.3548366314721373e-06,
"loss": 0.3976,
"step": 1976
},
{
"epoch": 2.1296229802513467,
"grad_norm": 0.2819250226020813,
"learning_rate": 2.3495174354338083e-06,
"loss": 0.3635,
"step": 1977
},
{
"epoch": 2.1307001795332137,
"grad_norm": 0.2861512303352356,
"learning_rate": 2.344202408177428e-06,
"loss": 0.3617,
"step": 1978
},
{
"epoch": 2.1317773788150807,
"grad_norm": 0.3052145540714264,
"learning_rate": 2.3388915580626807e-06,
"loss": 0.3758,
"step": 1979
},
{
"epoch": 2.132854578096948,
"grad_norm": 0.27510857582092285,
"learning_rate": 2.333584893442675e-06,
"loss": 0.3805,
"step": 1980
},
{
"epoch": 2.133931777378815,
"grad_norm": 0.30943563580513,
"learning_rate": 2.3282824226639393e-06,
"loss": 0.3907,
"step": 1981
},
{
"epoch": 2.135008976660682,
"grad_norm": 0.29370906949043274,
"learning_rate": 2.3229841540664065e-06,
"loss": 0.3815,
"step": 1982
},
{
"epoch": 2.1360861759425496,
"grad_norm": 0.27837884426116943,
"learning_rate": 2.3176900959834e-06,
"loss": 0.3548,
"step": 1983
},
{
"epoch": 2.1371633752244166,
"grad_norm": 0.26997843384742737,
"learning_rate": 2.31240025674162e-06,
"loss": 0.364,
"step": 1984
},
{
"epoch": 2.1382405745062836,
"grad_norm": 0.3154236674308777,
"learning_rate": 2.3071146446611313e-06,
"loss": 0.3943,
"step": 1985
},
{
"epoch": 2.139317773788151,
"grad_norm": 0.317594975233078,
"learning_rate": 2.3018332680553478e-06,
"loss": 0.4027,
"step": 1986
},
{
"epoch": 2.140394973070018,
"grad_norm": 0.29514530301094055,
"learning_rate": 2.2965561352310257e-06,
"loss": 0.3693,
"step": 1987
},
{
"epoch": 2.141472172351885,
"grad_norm": 0.2793351411819458,
"learning_rate": 2.2912832544882434e-06,
"loss": 0.3678,
"step": 1988
},
{
"epoch": 2.142549371633752,
"grad_norm": 0.31178465485572815,
"learning_rate": 2.2860146341203936e-06,
"loss": 0.3883,
"step": 1989
},
{
"epoch": 2.1436265709156195,
"grad_norm": 0.27136895060539246,
"learning_rate": 2.280750282414169e-06,
"loss": 0.3579,
"step": 1990
},
{
"epoch": 2.1447037701974865,
"grad_norm": 0.27261051535606384,
"learning_rate": 2.2754902076495424e-06,
"loss": 0.367,
"step": 1991
},
{
"epoch": 2.1457809694793535,
"grad_norm": 0.2901833653450012,
"learning_rate": 2.270234418099765e-06,
"loss": 0.3525,
"step": 1992
},
{
"epoch": 2.146858168761221,
"grad_norm": 0.3003911077976227,
"learning_rate": 2.264982922031348e-06,
"loss": 0.4071,
"step": 1993
},
{
"epoch": 2.147935368043088,
"grad_norm": 0.2801694869995117,
"learning_rate": 2.2597357277040494e-06,
"loss": 0.4014,
"step": 1994
},
{
"epoch": 2.149012567324955,
"grad_norm": 0.27856895327568054,
"learning_rate": 2.254492843370857e-06,
"loss": 0.3261,
"step": 1995
},
{
"epoch": 2.1500897666068224,
"grad_norm": 0.31977221369743347,
"learning_rate": 2.249254277277984e-06,
"loss": 0.4686,
"step": 1996
},
{
"epoch": 2.1511669658886894,
"grad_norm": 0.2627192437648773,
"learning_rate": 2.2440200376648524e-06,
"loss": 0.3757,
"step": 1997
},
{
"epoch": 2.1522441651705564,
"grad_norm": 0.2889450192451477,
"learning_rate": 2.238790132764076e-06,
"loss": 0.3737,
"step": 1998
},
{
"epoch": 2.153321364452424,
"grad_norm": 0.267654150724411,
"learning_rate": 2.233564570801453e-06,
"loss": 0.3646,
"step": 1999
},
{
"epoch": 2.154398563734291,
"grad_norm": 0.30864495038986206,
"learning_rate": 2.2283433599959525e-06,
"loss": 0.3846,
"step": 2000
},
{
"epoch": 2.155475763016158,
"grad_norm": 0.3336288332939148,
"learning_rate": 2.2231265085596935e-06,
"loss": 0.3714,
"step": 2001
},
{
"epoch": 2.1565529622980253,
"grad_norm": 0.2831747531890869,
"learning_rate": 2.2179140246979463e-06,
"loss": 0.3615,
"step": 2002
},
{
"epoch": 2.1576301615798923,
"grad_norm": 0.2831881046295166,
"learning_rate": 2.2127059166091046e-06,
"loss": 0.3555,
"step": 2003
},
{
"epoch": 2.1587073608617593,
"grad_norm": 0.3034612238407135,
"learning_rate": 2.207502192484685e-06,
"loss": 0.3775,
"step": 2004
},
{
"epoch": 2.1597845601436267,
"grad_norm": 0.2869066298007965,
"learning_rate": 2.202302860509307e-06,
"loss": 0.389,
"step": 2005
},
{
"epoch": 2.1608617594254937,
"grad_norm": 0.3017926514148712,
"learning_rate": 2.1971079288606813e-06,
"loss": 0.3694,
"step": 2006
},
{
"epoch": 2.1619389587073607,
"grad_norm": 0.28795140981674194,
"learning_rate": 2.191917405709598e-06,
"loss": 0.3491,
"step": 2007
},
{
"epoch": 2.163016157989228,
"grad_norm": 0.30164435505867004,
"learning_rate": 2.186731299219915e-06,
"loss": 0.402,
"step": 2008
},
{
"epoch": 2.164093357271095,
"grad_norm": 0.30009475350379944,
"learning_rate": 2.1815496175485433e-06,
"loss": 0.3717,
"step": 2009
},
{
"epoch": 2.165170556552962,
"grad_norm": 0.3006815016269684,
"learning_rate": 2.1763723688454297e-06,
"loss": 0.3623,
"step": 2010
},
{
"epoch": 2.1662477558348296,
"grad_norm": 0.3288932144641876,
"learning_rate": 2.1711995612535547e-06,
"loss": 0.376,
"step": 2011
},
{
"epoch": 2.1673249551166966,
"grad_norm": 0.29935574531555176,
"learning_rate": 2.1660312029089083e-06,
"loss": 0.3943,
"step": 2012
},
{
"epoch": 2.1684021543985637,
"grad_norm": 0.2717684209346771,
"learning_rate": 2.1608673019404867e-06,
"loss": 0.38,
"step": 2013
},
{
"epoch": 2.1694793536804307,
"grad_norm": 0.29062172770500183,
"learning_rate": 2.1557078664702747e-06,
"loss": 0.3701,
"step": 2014
},
{
"epoch": 2.170556552962298,
"grad_norm": 0.28731608390808105,
"learning_rate": 2.1505529046132316e-06,
"loss": 0.3816,
"step": 2015
},
{
"epoch": 2.171633752244165,
"grad_norm": 0.29999956488609314,
"learning_rate": 2.145402424477283e-06,
"loss": 0.4037,
"step": 2016
},
{
"epoch": 2.172710951526032,
"grad_norm": 0.2899573743343353,
"learning_rate": 2.140256434163303e-06,
"loss": 0.3361,
"step": 2017
},
{
"epoch": 2.1737881508078996,
"grad_norm": 0.3206027150154114,
"learning_rate": 2.135114941765108e-06,
"loss": 0.3869,
"step": 2018
},
{
"epoch": 2.1748653500897666,
"grad_norm": 0.30229973793029785,
"learning_rate": 2.1299779553694323e-06,
"loss": 0.3607,
"step": 2019
},
{
"epoch": 2.1759425493716336,
"grad_norm": 0.2830967903137207,
"learning_rate": 2.1248454830559307e-06,
"loss": 0.4132,
"step": 2020
},
{
"epoch": 2.177019748653501,
"grad_norm": 0.2616431415081024,
"learning_rate": 2.119717532897155e-06,
"loss": 0.351,
"step": 2021
},
{
"epoch": 2.178096947935368,
"grad_norm": 0.2953941524028778,
"learning_rate": 2.1145941129585434e-06,
"loss": 0.4132,
"step": 2022
},
{
"epoch": 2.179174147217235,
"grad_norm": 0.27539801597595215,
"learning_rate": 2.1094752312984096e-06,
"loss": 0.336,
"step": 2023
},
{
"epoch": 2.1802513464991025,
"grad_norm": 0.30243703722953796,
"learning_rate": 2.1043608959679302e-06,
"loss": 0.3641,
"step": 2024
},
{
"epoch": 2.1813285457809695,
"grad_norm": 0.3062271475791931,
"learning_rate": 2.09925111501113e-06,
"loss": 0.389,
"step": 2025
},
{
"epoch": 2.1824057450628365,
"grad_norm": 0.2824309170246124,
"learning_rate": 2.0941458964648737e-06,
"loss": 0.3559,
"step": 2026
},
{
"epoch": 2.183482944344704,
"grad_norm": 0.30429500341415405,
"learning_rate": 2.0890452483588434e-06,
"loss": 0.3657,
"step": 2027
},
{
"epoch": 2.184560143626571,
"grad_norm": 0.2938910722732544,
"learning_rate": 2.0839491787155387e-06,
"loss": 0.3697,
"step": 2028
},
{
"epoch": 2.185637342908438,
"grad_norm": 0.3032033443450928,
"learning_rate": 2.0788576955502547e-06,
"loss": 0.3612,
"step": 2029
},
{
"epoch": 2.1867145421903054,
"grad_norm": 0.3238908648490906,
"learning_rate": 2.0737708068710753e-06,
"loss": 0.3952,
"step": 2030
},
{
"epoch": 2.1877917414721724,
"grad_norm": 0.3095376193523407,
"learning_rate": 2.0686885206788563e-06,
"loss": 0.4077,
"step": 2031
},
{
"epoch": 2.1888689407540394,
"grad_norm": 0.27973851561546326,
"learning_rate": 2.0636108449672167e-06,
"loss": 0.3812,
"step": 2032
},
{
"epoch": 2.189946140035907,
"grad_norm": 0.2777497172355652,
"learning_rate": 2.0585377877225176e-06,
"loss": 0.3681,
"step": 2033
},
{
"epoch": 2.191023339317774,
"grad_norm": 0.2898228168487549,
"learning_rate": 2.053469356923865e-06,
"loss": 0.3811,
"step": 2034
},
{
"epoch": 2.192100538599641,
"grad_norm": 0.296670138835907,
"learning_rate": 2.0484055605430807e-06,
"loss": 0.3853,
"step": 2035
},
{
"epoch": 2.1931777378815083,
"grad_norm": 0.31800577044487,
"learning_rate": 2.043346406544701e-06,
"loss": 0.3901,
"step": 2036
},
{
"epoch": 2.1942549371633753,
"grad_norm": 0.2928822636604309,
"learning_rate": 2.0382919028859606e-06,
"loss": 0.3608,
"step": 2037
},
{
"epoch": 2.1953321364452423,
"grad_norm": 0.2876681685447693,
"learning_rate": 2.033242057516779e-06,
"loss": 0.3788,
"step": 2038
},
{
"epoch": 2.1964093357271093,
"grad_norm": 0.2682286202907562,
"learning_rate": 2.028196878379749e-06,
"loss": 0.3644,
"step": 2039
},
{
"epoch": 2.1974865350089767,
"grad_norm": 0.2802661061286926,
"learning_rate": 2.0231563734101245e-06,
"loss": 0.3386,
"step": 2040
},
{
"epoch": 2.1985637342908437,
"grad_norm": 0.258585125207901,
"learning_rate": 2.0181205505358098e-06,
"loss": 0.3483,
"step": 2041
},
{
"epoch": 2.199640933572711,
"grad_norm": 0.2854132652282715,
"learning_rate": 2.013089417677338e-06,
"loss": 0.4009,
"step": 2042
},
{
"epoch": 2.200718132854578,
"grad_norm": 0.29215067625045776,
"learning_rate": 2.0080629827478755e-06,
"loss": 0.3803,
"step": 2043
},
{
"epoch": 2.201795332136445,
"grad_norm": 0.2704610526561737,
"learning_rate": 2.0030412536531896e-06,
"loss": 0.3511,
"step": 2044
},
{
"epoch": 2.202872531418312,
"grad_norm": 0.2924228608608246,
"learning_rate": 1.998024238291653e-06,
"loss": 0.3665,
"step": 2045
},
{
"epoch": 2.2039497307001796,
"grad_norm": 0.28420692682266235,
"learning_rate": 1.993011944554223e-06,
"loss": 0.3589,
"step": 2046
},
{
"epoch": 2.2050269299820466,
"grad_norm": 0.2870817482471466,
"learning_rate": 1.9880043803244285e-06,
"loss": 0.3864,
"step": 2047
},
{
"epoch": 2.2061041292639136,
"grad_norm": 0.2669335603713989,
"learning_rate": 1.9830015534783626e-06,
"loss": 0.3774,
"step": 2048
},
{
"epoch": 2.207181328545781,
"grad_norm": 0.28127819299697876,
"learning_rate": 1.9780034718846653e-06,
"loss": 0.3582,
"step": 2049
},
{
"epoch": 2.208258527827648,
"grad_norm": 0.3107064366340637,
"learning_rate": 1.9730101434045146e-06,
"loss": 0.4,
"step": 2050
},
{
"epoch": 2.209335727109515,
"grad_norm": 0.2876104414463043,
"learning_rate": 1.968021575891609e-06,
"loss": 0.3549,
"step": 2051
},
{
"epoch": 2.2104129263913825,
"grad_norm": 0.2810943126678467,
"learning_rate": 1.9630377771921624e-06,
"loss": 0.3469,
"step": 2052
},
{
"epoch": 2.2114901256732495,
"grad_norm": 0.33290818333625793,
"learning_rate": 1.9580587551448887e-06,
"loss": 0.3922,
"step": 2053
},
{
"epoch": 2.2125673249551165,
"grad_norm": 0.2748958170413971,
"learning_rate": 1.9530845175809838e-06,
"loss": 0.3654,
"step": 2054
},
{
"epoch": 2.213644524236984,
"grad_norm": 0.30484747886657715,
"learning_rate": 1.9481150723241236e-06,
"loss": 0.4034,
"step": 2055
},
{
"epoch": 2.214721723518851,
"grad_norm": 0.2513694763183594,
"learning_rate": 1.943150427190445e-06,
"loss": 0.3236,
"step": 2056
},
{
"epoch": 2.215798922800718,
"grad_norm": 0.3133890628814697,
"learning_rate": 1.9381905899885344e-06,
"loss": 0.4458,
"step": 2057
},
{
"epoch": 2.2168761220825854,
"grad_norm": 0.3194190561771393,
"learning_rate": 1.9332355685194182e-06,
"loss": 0.3784,
"step": 2058
},
{
"epoch": 2.2179533213644524,
"grad_norm": 0.2863844037055969,
"learning_rate": 1.9282853705765435e-06,
"loss": 0.3491,
"step": 2059
},
{
"epoch": 2.2190305206463194,
"grad_norm": 0.28616365790367126,
"learning_rate": 1.923340003945775e-06,
"loss": 0.3826,
"step": 2060
},
{
"epoch": 2.220107719928187,
"grad_norm": 0.27722132205963135,
"learning_rate": 1.918399476405378e-06,
"loss": 0.4063,
"step": 2061
},
{
"epoch": 2.221184919210054,
"grad_norm": 0.2680579423904419,
"learning_rate": 1.913463795726007e-06,
"loss": 0.3643,
"step": 2062
},
{
"epoch": 2.222262118491921,
"grad_norm": 0.30088430643081665,
"learning_rate": 1.90853296967069e-06,
"loss": 0.3749,
"step": 2063
},
{
"epoch": 2.2233393177737883,
"grad_norm": 0.2925657033920288,
"learning_rate": 1.9036070059948253e-06,
"loss": 0.3703,
"step": 2064
},
{
"epoch": 2.2244165170556554,
"grad_norm": 0.2966391146183014,
"learning_rate": 1.898685912446156e-06,
"loss": 0.3929,
"step": 2065
},
{
"epoch": 2.2254937163375224,
"grad_norm": 0.2713056206703186,
"learning_rate": 1.8937696967647735e-06,
"loss": 0.3553,
"step": 2066
},
{
"epoch": 2.22657091561939,
"grad_norm": 0.30269116163253784,
"learning_rate": 1.8888583666830878e-06,
"loss": 0.3775,
"step": 2067
},
{
"epoch": 2.227648114901257,
"grad_norm": 0.30684664845466614,
"learning_rate": 1.8839519299258325e-06,
"loss": 0.3848,
"step": 2068
},
{
"epoch": 2.228725314183124,
"grad_norm": 0.27545851469039917,
"learning_rate": 1.8790503942100413e-06,
"loss": 0.3534,
"step": 2069
},
{
"epoch": 2.229802513464991,
"grad_norm": 0.29838407039642334,
"learning_rate": 1.8741537672450406e-06,
"loss": 0.3833,
"step": 2070
},
{
"epoch": 2.2308797127468583,
"grad_norm": 0.29707691073417664,
"learning_rate": 1.8692620567324354e-06,
"loss": 0.3847,
"step": 2071
},
{
"epoch": 2.2319569120287253,
"grad_norm": 0.31610336899757385,
"learning_rate": 1.8643752703660978e-06,
"loss": 0.3871,
"step": 2072
},
{
"epoch": 2.2330341113105923,
"grad_norm": 0.2851659655570984,
"learning_rate": 1.859493415832157e-06,
"loss": 0.3901,
"step": 2073
},
{
"epoch": 2.2341113105924597,
"grad_norm": 0.3058148920536041,
"learning_rate": 1.8546165008089806e-06,
"loss": 0.4152,
"step": 2074
},
{
"epoch": 2.2351885098743267,
"grad_norm": 0.2662912905216217,
"learning_rate": 1.8497445329671725e-06,
"loss": 0.3692,
"step": 2075
},
{
"epoch": 2.2362657091561937,
"grad_norm": 0.2919093370437622,
"learning_rate": 1.8448775199695501e-06,
"loss": 0.3102,
"step": 2076
},
{
"epoch": 2.237342908438061,
"grad_norm": 0.29858067631721497,
"learning_rate": 1.8400154694711424e-06,
"loss": 0.4039,
"step": 2077
},
{
"epoch": 2.238420107719928,
"grad_norm": 0.2683444023132324,
"learning_rate": 1.835158389119171e-06,
"loss": 0.3416,
"step": 2078
},
{
"epoch": 2.239497307001795,
"grad_norm": 0.29827025532722473,
"learning_rate": 1.8303062865530407e-06,
"loss": 0.4094,
"step": 2079
},
{
"epoch": 2.2405745062836626,
"grad_norm": 0.3107103705406189,
"learning_rate": 1.8254591694043267e-06,
"loss": 0.3767,
"step": 2080
},
{
"epoch": 2.2416517055655296,
"grad_norm": 0.3018888831138611,
"learning_rate": 1.8206170452967636e-06,
"loss": 0.3604,
"step": 2081
},
{
"epoch": 2.2427289048473966,
"grad_norm": 0.2799268662929535,
"learning_rate": 1.8157799218462335e-06,
"loss": 0.3684,
"step": 2082
},
{
"epoch": 2.243806104129264,
"grad_norm": 0.29917651414871216,
"learning_rate": 1.8109478066607495e-06,
"loss": 0.3752,
"step": 2083
},
{
"epoch": 2.244883303411131,
"grad_norm": 0.2883894741535187,
"learning_rate": 1.8061207073404507e-06,
"loss": 0.3911,
"step": 2084
},
{
"epoch": 2.245960502692998,
"grad_norm": 0.29146096110343933,
"learning_rate": 1.8012986314775888e-06,
"loss": 0.3613,
"step": 2085
},
{
"epoch": 2.2470377019748655,
"grad_norm": 0.3059714734554291,
"learning_rate": 1.7964815866565088e-06,
"loss": 0.3653,
"step": 2086
},
{
"epoch": 2.2481149012567325,
"grad_norm": 0.310809463262558,
"learning_rate": 1.7916695804536477e-06,
"loss": 0.3495,
"step": 2087
},
{
"epoch": 2.2491921005385995,
"grad_norm": 0.2690834701061249,
"learning_rate": 1.786862620437515e-06,
"loss": 0.348,
"step": 2088
},
{
"epoch": 2.250269299820467,
"grad_norm": 0.30786246061325073,
"learning_rate": 1.7820607141686846e-06,
"loss": 0.4014,
"step": 2089
},
{
"epoch": 2.251346499102334,
"grad_norm": 0.2623322010040283,
"learning_rate": 1.7772638691997835e-06,
"loss": 0.3643,
"step": 2090
},
{
"epoch": 2.252423698384201,
"grad_norm": 0.29593855142593384,
"learning_rate": 1.7724720930754713e-06,
"loss": 0.3535,
"step": 2091
},
{
"epoch": 2.2535008976660684,
"grad_norm": 0.30418071150779724,
"learning_rate": 1.7676853933324423e-06,
"loss": 0.3811,
"step": 2092
},
{
"epoch": 2.2545780969479354,
"grad_norm": 0.31614336371421814,
"learning_rate": 1.762903777499404e-06,
"loss": 0.3894,
"step": 2093
},
{
"epoch": 2.2556552962298024,
"grad_norm": 0.2974374294281006,
"learning_rate": 1.7581272530970666e-06,
"loss": 0.3793,
"step": 2094
},
{
"epoch": 2.2567324955116694,
"grad_norm": 0.27945080399513245,
"learning_rate": 1.7533558276381351e-06,
"loss": 0.3461,
"step": 2095
},
{
"epoch": 2.257809694793537,
"grad_norm": 0.296705424785614,
"learning_rate": 1.7485895086272903e-06,
"loss": 0.3794,
"step": 2096
},
{
"epoch": 2.258886894075404,
"grad_norm": 0.31906142830848694,
"learning_rate": 1.7438283035611847e-06,
"loss": 0.3702,
"step": 2097
},
{
"epoch": 2.2599640933572713,
"grad_norm": 0.29481953382492065,
"learning_rate": 1.7390722199284287e-06,
"loss": 0.3912,
"step": 2098
},
{
"epoch": 2.2610412926391383,
"grad_norm": 0.27198755741119385,
"learning_rate": 1.734321265209572e-06,
"loss": 0.3504,
"step": 2099
},
{
"epoch": 2.2621184919210053,
"grad_norm": 0.2756825089454651,
"learning_rate": 1.7295754468771026e-06,
"loss": 0.4126,
"step": 2100
},
{
"epoch": 2.2631956912028723,
"grad_norm": 0.28191620111465454,
"learning_rate": 1.724834772395428e-06,
"loss": 0.3646,
"step": 2101
},
{
"epoch": 2.26427289048474,
"grad_norm": 0.2847255766391754,
"learning_rate": 1.7200992492208647e-06,
"loss": 0.3734,
"step": 2102
},
{
"epoch": 2.265350089766607,
"grad_norm": 0.31416910886764526,
"learning_rate": 1.7153688848016277e-06,
"loss": 0.3747,
"step": 2103
},
{
"epoch": 2.266427289048474,
"grad_norm": 0.30127301812171936,
"learning_rate": 1.7106436865778182e-06,
"loss": 0.3609,
"step": 2104
},
{
"epoch": 2.2675044883303412,
"grad_norm": 0.283171683549881,
"learning_rate": 1.7059236619814128e-06,
"loss": 0.3874,
"step": 2105
},
{
"epoch": 2.2685816876122082,
"grad_norm": 0.28598931431770325,
"learning_rate": 1.7012088184362469e-06,
"loss": 0.3628,
"step": 2106
},
{
"epoch": 2.2696588868940752,
"grad_norm": 0.2925755977630615,
"learning_rate": 1.6964991633580118e-06,
"loss": 0.3387,
"step": 2107
},
{
"epoch": 2.2707360861759427,
"grad_norm": 0.2768424451351166,
"learning_rate": 1.6917947041542342e-06,
"loss": 0.3738,
"step": 2108
},
{
"epoch": 2.2718132854578097,
"grad_norm": 0.26826032996177673,
"learning_rate": 1.6870954482242707e-06,
"loss": 0.3727,
"step": 2109
},
{
"epoch": 2.2728904847396767,
"grad_norm": 0.2744354009628296,
"learning_rate": 1.6824014029592944e-06,
"loss": 0.368,
"step": 2110
},
{
"epoch": 2.273967684021544,
"grad_norm": 0.3166108727455139,
"learning_rate": 1.6777125757422813e-06,
"loss": 0.3999,
"step": 2111
},
{
"epoch": 2.275044883303411,
"grad_norm": 0.27384570240974426,
"learning_rate": 1.6730289739480015e-06,
"loss": 0.3329,
"step": 2112
},
{
"epoch": 2.276122082585278,
"grad_norm": 0.3060706555843353,
"learning_rate": 1.668350604943006e-06,
"loss": 0.4192,
"step": 2113
},
{
"epoch": 2.2771992818671456,
"grad_norm": 0.279412180185318,
"learning_rate": 1.663677476085616e-06,
"loss": 0.3569,
"step": 2114
},
{
"epoch": 2.2782764811490126,
"grad_norm": 0.30697518587112427,
"learning_rate": 1.6590095947259083e-06,
"loss": 0.3993,
"step": 2115
},
{
"epoch": 2.2793536804308796,
"grad_norm": 0.27257391810417175,
"learning_rate": 1.6543469682057105e-06,
"loss": 0.3356,
"step": 2116
},
{
"epoch": 2.280430879712747,
"grad_norm": 0.2825731337070465,
"learning_rate": 1.6496896038585796e-06,
"loss": 0.3501,
"step": 2117
},
{
"epoch": 2.281508078994614,
"grad_norm": 0.2653917074203491,
"learning_rate": 1.6450375090098003e-06,
"loss": 0.3694,
"step": 2118
},
{
"epoch": 2.282585278276481,
"grad_norm": 0.2905532121658325,
"learning_rate": 1.6403906909763688e-06,
"loss": 0.368,
"step": 2119
},
{
"epoch": 2.283662477558348,
"grad_norm": 0.2730623781681061,
"learning_rate": 1.6357491570669814e-06,
"loss": 0.3547,
"step": 2120
},
{
"epoch": 2.2847396768402155,
"grad_norm": 0.2855893671512604,
"learning_rate": 1.631112914582022e-06,
"loss": 0.3762,
"step": 2121
},
{
"epoch": 2.2858168761220825,
"grad_norm": 0.2811420261859894,
"learning_rate": 1.6264819708135549e-06,
"loss": 0.3816,
"step": 2122
},
{
"epoch": 2.28689407540395,
"grad_norm": 0.33978739380836487,
"learning_rate": 1.6218563330453052e-06,
"loss": 0.3769,
"step": 2123
},
{
"epoch": 2.287971274685817,
"grad_norm": 0.27539363503456116,
"learning_rate": 1.6172360085526567e-06,
"loss": 0.3776,
"step": 2124
},
{
"epoch": 2.289048473967684,
"grad_norm": 0.28592661023139954,
"learning_rate": 1.6126210046026364e-06,
"loss": 0.381,
"step": 2125
},
{
"epoch": 2.290125673249551,
"grad_norm": 0.28629782795906067,
"learning_rate": 1.6080113284539011e-06,
"loss": 0.3909,
"step": 2126
},
{
"epoch": 2.2912028725314184,
"grad_norm": 0.28644952178001404,
"learning_rate": 1.6034069873567305e-06,
"loss": 0.3965,
"step": 2127
},
{
"epoch": 2.2922800718132854,
"grad_norm": 0.2881433665752411,
"learning_rate": 1.5988079885530073e-06,
"loss": 0.3642,
"step": 2128
},
{
"epoch": 2.2933572710951524,
"grad_norm": 0.2940407395362854,
"learning_rate": 1.5942143392762178e-06,
"loss": 0.3544,
"step": 2129
},
{
"epoch": 2.29443447037702,
"grad_norm": 0.29156753420829773,
"learning_rate": 1.5896260467514335e-06,
"loss": 0.3703,
"step": 2130
},
{
"epoch": 2.295511669658887,
"grad_norm": 0.29485079646110535,
"learning_rate": 1.5850431181952953e-06,
"loss": 0.4186,
"step": 2131
},
{
"epoch": 2.296588868940754,
"grad_norm": 0.27262139320373535,
"learning_rate": 1.5804655608160135e-06,
"loss": 0.3508,
"step": 2132
},
{
"epoch": 2.2976660682226213,
"grad_norm": 0.2714279592037201,
"learning_rate": 1.5758933818133482e-06,
"loss": 0.3392,
"step": 2133
},
{
"epoch": 2.2987432675044883,
"grad_norm": 0.29111814498901367,
"learning_rate": 1.5713265883786e-06,
"loss": 0.3844,
"step": 2134
},
{
"epoch": 2.2998204667863553,
"grad_norm": 0.2989579439163208,
"learning_rate": 1.5667651876945994e-06,
"loss": 0.3799,
"step": 2135
},
{
"epoch": 2.3008976660682228,
"grad_norm": 0.29087311029434204,
"learning_rate": 1.5622091869356937e-06,
"loss": 0.3532,
"step": 2136
},
{
"epoch": 2.3019748653500898,
"grad_norm": 0.29848843812942505,
"learning_rate": 1.5576585932677407e-06,
"loss": 0.4171,
"step": 2137
},
{
"epoch": 2.3030520646319568,
"grad_norm": 0.2713443338871002,
"learning_rate": 1.5531134138480863e-06,
"loss": 0.3645,
"step": 2138
},
{
"epoch": 2.304129263913824,
"grad_norm": 0.27279627323150635,
"learning_rate": 1.54857365582557e-06,
"loss": 0.3689,
"step": 2139
},
{
"epoch": 2.3052064631956912,
"grad_norm": 0.28853660821914673,
"learning_rate": 1.544039326340495e-06,
"loss": 0.3914,
"step": 2140
},
{
"epoch": 2.3062836624775582,
"grad_norm": 0.287783682346344,
"learning_rate": 1.5395104325246336e-06,
"loss": 0.3859,
"step": 2141
},
{
"epoch": 2.3073608617594257,
"grad_norm": 0.30437979102134705,
"learning_rate": 1.5349869815012053e-06,
"loss": 0.3802,
"step": 2142
},
{
"epoch": 2.3084380610412927,
"grad_norm": 0.2932701110839844,
"learning_rate": 1.5304689803848699e-06,
"loss": 0.4123,
"step": 2143
},
{
"epoch": 2.3095152603231597,
"grad_norm": 0.2476838082075119,
"learning_rate": 1.5259564362817147e-06,
"loss": 0.3413,
"step": 2144
},
{
"epoch": 2.3105924596050267,
"grad_norm": 0.27603915333747864,
"learning_rate": 1.521449356289245e-06,
"loss": 0.3702,
"step": 2145
},
{
"epoch": 2.311669658886894,
"grad_norm": 0.26957961916923523,
"learning_rate": 1.5169477474963722e-06,
"loss": 0.3523,
"step": 2146
},
{
"epoch": 2.312746858168761,
"grad_norm": 0.2828165888786316,
"learning_rate": 1.512451616983399e-06,
"loss": 0.3947,
"step": 2147
},
{
"epoch": 2.3138240574506286,
"grad_norm": 0.2874089777469635,
"learning_rate": 1.5079609718220167e-06,
"loss": 0.4005,
"step": 2148
},
{
"epoch": 2.3149012567324956,
"grad_norm": 0.2528451085090637,
"learning_rate": 1.5034758190752836e-06,
"loss": 0.3726,
"step": 2149
},
{
"epoch": 2.3159784560143626,
"grad_norm": 0.27315106987953186,
"learning_rate": 1.4989961657976237e-06,
"loss": 0.3915,
"step": 2150
},
{
"epoch": 2.3170556552962296,
"grad_norm": 0.2935149371623993,
"learning_rate": 1.4945220190348103e-06,
"loss": 0.3522,
"step": 2151
},
{
"epoch": 2.318132854578097,
"grad_norm": 0.30114105343818665,
"learning_rate": 1.4900533858239542e-06,
"loss": 0.3763,
"step": 2152
},
{
"epoch": 2.319210053859964,
"grad_norm": 0.2741653025150299,
"learning_rate": 1.4855902731934962e-06,
"loss": 0.3671,
"step": 2153
},
{
"epoch": 2.3202872531418315,
"grad_norm": 0.28554973006248474,
"learning_rate": 1.4811326881631937e-06,
"loss": 0.3763,
"step": 2154
},
{
"epoch": 2.3213644524236985,
"grad_norm": 0.26286450028419495,
"learning_rate": 1.4766806377441078e-06,
"loss": 0.369,
"step": 2155
},
{
"epoch": 2.3224416517055655,
"grad_norm": 0.28501445055007935,
"learning_rate": 1.4722341289385978e-06,
"loss": 0.3913,
"step": 2156
},
{
"epoch": 2.3235188509874325,
"grad_norm": 0.2689116597175598,
"learning_rate": 1.4677931687403046e-06,
"loss": 0.3667,
"step": 2157
},
{
"epoch": 2.3245960502693,
"grad_norm": 0.2794966697692871,
"learning_rate": 1.4633577641341445e-06,
"loss": 0.3657,
"step": 2158
},
{
"epoch": 2.325673249551167,
"grad_norm": 0.2858525514602661,
"learning_rate": 1.4589279220962922e-06,
"loss": 0.3643,
"step": 2159
},
{
"epoch": 2.326750448833034,
"grad_norm": 0.28977170586586,
"learning_rate": 1.454503649594176e-06,
"loss": 0.3835,
"step": 2160
},
{
"epoch": 2.3278276481149014,
"grad_norm": 0.2896460294723511,
"learning_rate": 1.4500849535864636e-06,
"loss": 0.3782,
"step": 2161
},
{
"epoch": 2.3289048473967684,
"grad_norm": 0.30394282937049866,
"learning_rate": 1.4456718410230541e-06,
"loss": 0.3838,
"step": 2162
},
{
"epoch": 2.3299820466786354,
"grad_norm": 0.28289011120796204,
"learning_rate": 1.4412643188450581e-06,
"loss": 0.3606,
"step": 2163
},
{
"epoch": 2.331059245960503,
"grad_norm": 0.27398374676704407,
"learning_rate": 1.4368623939848003e-06,
"loss": 0.3863,
"step": 2164
},
{
"epoch": 2.33213644524237,
"grad_norm": 0.2954217493534088,
"learning_rate": 1.4324660733657985e-06,
"loss": 0.3795,
"step": 2165
},
{
"epoch": 2.333213644524237,
"grad_norm": 0.2950679361820221,
"learning_rate": 1.4280753639027567e-06,
"loss": 0.4029,
"step": 2166
},
{
"epoch": 2.3342908438061043,
"grad_norm": 0.2834983766078949,
"learning_rate": 1.4236902725015533e-06,
"loss": 0.3542,
"step": 2167
},
{
"epoch": 2.3353680430879713,
"grad_norm": 0.3374176323413849,
"learning_rate": 1.4193108060592308e-06,
"loss": 0.3786,
"step": 2168
},
{
"epoch": 2.3364452423698383,
"grad_norm": 0.29156172275543213,
"learning_rate": 1.4149369714639856e-06,
"loss": 0.386,
"step": 2169
},
{
"epoch": 2.3375224416517058,
"grad_norm": 0.29786962270736694,
"learning_rate": 1.4105687755951508e-06,
"loss": 0.4045,
"step": 2170
},
{
"epoch": 2.3385996409335728,
"grad_norm": 0.2667715847492218,
"learning_rate": 1.4062062253231983e-06,
"loss": 0.3425,
"step": 2171
},
{
"epoch": 2.3396768402154398,
"grad_norm": 0.31166592240333557,
"learning_rate": 1.401849327509714e-06,
"loss": 0.3904,
"step": 2172
},
{
"epoch": 2.340754039497307,
"grad_norm": 0.28892043232917786,
"learning_rate": 1.3974980890073968e-06,
"loss": 0.3565,
"step": 2173
},
{
"epoch": 2.341831238779174,
"grad_norm": 0.3395155072212219,
"learning_rate": 1.3931525166600447e-06,
"loss": 0.3847,
"step": 2174
},
{
"epoch": 2.342908438061041,
"grad_norm": 0.302751362323761,
"learning_rate": 1.3888126173025412e-06,
"loss": 0.3638,
"step": 2175
},
{
"epoch": 2.343985637342908,
"grad_norm": 0.2783910930156708,
"learning_rate": 1.3844783977608494e-06,
"loss": 0.3612,
"step": 2176
},
{
"epoch": 2.3450628366247757,
"grad_norm": 0.291925311088562,
"learning_rate": 1.3801498648519984e-06,
"loss": 0.3712,
"step": 2177
},
{
"epoch": 2.3461400359066427,
"grad_norm": 0.2917797863483429,
"learning_rate": 1.3758270253840745e-06,
"loss": 0.3646,
"step": 2178
},
{
"epoch": 2.34721723518851,
"grad_norm": 0.303262323141098,
"learning_rate": 1.371509886156206e-06,
"loss": 0.3972,
"step": 2179
},
{
"epoch": 2.348294434470377,
"grad_norm": 0.2622956335544586,
"learning_rate": 1.3671984539585548e-06,
"loss": 0.3108,
"step": 2180
},
{
"epoch": 2.349371633752244,
"grad_norm": 0.3135432302951813,
"learning_rate": 1.3628927355723115e-06,
"loss": 0.4323,
"step": 2181
},
{
"epoch": 2.350448833034111,
"grad_norm": 0.3027063310146332,
"learning_rate": 1.3585927377696766e-06,
"loss": 0.3626,
"step": 2182
},
{
"epoch": 2.3515260323159786,
"grad_norm": 0.2851846218109131,
"learning_rate": 1.3542984673138542e-06,
"loss": 0.3655,
"step": 2183
},
{
"epoch": 2.3526032315978456,
"grad_norm": 0.30676692724227905,
"learning_rate": 1.3500099309590397e-06,
"loss": 0.3758,
"step": 2184
},
{
"epoch": 2.3536804308797126,
"grad_norm": 0.2884361743927002,
"learning_rate": 1.3457271354504097e-06,
"loss": 0.3817,
"step": 2185
},
{
"epoch": 2.35475763016158,
"grad_norm": 0.28114891052246094,
"learning_rate": 1.341450087524112e-06,
"loss": 0.3488,
"step": 2186
},
{
"epoch": 2.355834829443447,
"grad_norm": 0.3103155791759491,
"learning_rate": 1.3371787939072523e-06,
"loss": 0.4043,
"step": 2187
},
{
"epoch": 2.356912028725314,
"grad_norm": 0.27932849526405334,
"learning_rate": 1.332913261317887e-06,
"loss": 0.3594,
"step": 2188
},
{
"epoch": 2.3579892280071815,
"grad_norm": 0.26250553131103516,
"learning_rate": 1.3286534964650121e-06,
"loss": 0.3657,
"step": 2189
},
{
"epoch": 2.3590664272890485,
"grad_norm": 0.2756526470184326,
"learning_rate": 1.3243995060485537e-06,
"loss": 0.3922,
"step": 2190
},
{
"epoch": 2.3601436265709155,
"grad_norm": 0.24424968659877777,
"learning_rate": 1.3201512967593487e-06,
"loss": 0.368,
"step": 2191
},
{
"epoch": 2.361220825852783,
"grad_norm": 0.284087598323822,
"learning_rate": 1.3159088752791483e-06,
"loss": 0.3749,
"step": 2192
},
{
"epoch": 2.36229802513465,
"grad_norm": 0.2854723334312439,
"learning_rate": 1.3116722482805972e-06,
"loss": 0.3817,
"step": 2193
},
{
"epoch": 2.363375224416517,
"grad_norm": 0.2786845564842224,
"learning_rate": 1.3074414224272287e-06,
"loss": 0.4066,
"step": 2194
},
{
"epoch": 2.3644524236983844,
"grad_norm": 0.25960221886634827,
"learning_rate": 1.303216404373447e-06,
"loss": 0.3488,
"step": 2195
},
{
"epoch": 2.3655296229802514,
"grad_norm": 0.2924199104309082,
"learning_rate": 1.2989972007645262e-06,
"loss": 0.3796,
"step": 2196
},
{
"epoch": 2.3666068222621184,
"grad_norm": 0.2700774073600769,
"learning_rate": 1.2947838182365941e-06,
"loss": 0.3809,
"step": 2197
},
{
"epoch": 2.367684021543986,
"grad_norm": 0.2660199701786041,
"learning_rate": 1.2905762634166214e-06,
"loss": 0.3645,
"step": 2198
},
{
"epoch": 2.368761220825853,
"grad_norm": 0.28819847106933594,
"learning_rate": 1.2863745429224145e-06,
"loss": 0.3933,
"step": 2199
},
{
"epoch": 2.36983842010772,
"grad_norm": 0.28640761971473694,
"learning_rate": 1.2821786633626038e-06,
"loss": 0.3623,
"step": 2200
},
{
"epoch": 2.370915619389587,
"grad_norm": 0.2778407335281372,
"learning_rate": 1.2779886313366291e-06,
"loss": 0.3997,
"step": 2201
},
{
"epoch": 2.3719928186714543,
"grad_norm": 0.30692175030708313,
"learning_rate": 1.2738044534347366e-06,
"loss": 0.3731,
"step": 2202
},
{
"epoch": 2.3730700179533213,
"grad_norm": 0.27536556124687195,
"learning_rate": 1.2696261362379653e-06,
"loss": 0.3854,
"step": 2203
},
{
"epoch": 2.3741472172351887,
"grad_norm": 0.283770352602005,
"learning_rate": 1.2654536863181328e-06,
"loss": 0.3499,
"step": 2204
},
{
"epoch": 2.3752244165170557,
"grad_norm": 0.3203579783439636,
"learning_rate": 1.2612871102378305e-06,
"loss": 0.3899,
"step": 2205
},
{
"epoch": 2.3763016157989227,
"grad_norm": 0.2789762020111084,
"learning_rate": 1.2571264145504125e-06,
"loss": 0.3474,
"step": 2206
},
{
"epoch": 2.3773788150807897,
"grad_norm": 0.27259716391563416,
"learning_rate": 1.2529716057999819e-06,
"loss": 0.3495,
"step": 2207
},
{
"epoch": 2.378456014362657,
"grad_norm": 0.30078408122062683,
"learning_rate": 1.248822690521383e-06,
"loss": 0.381,
"step": 2208
},
{
"epoch": 2.379533213644524,
"grad_norm": 0.29212328791618347,
"learning_rate": 1.2446796752401912e-06,
"loss": 0.4109,
"step": 2209
},
{
"epoch": 2.380610412926391,
"grad_norm": 0.26162827014923096,
"learning_rate": 1.2405425664727044e-06,
"loss": 0.3562,
"step": 2210
},
{
"epoch": 2.3816876122082586,
"grad_norm": 0.26936033368110657,
"learning_rate": 1.2364113707259251e-06,
"loss": 0.3897,
"step": 2211
},
{
"epoch": 2.3827648114901256,
"grad_norm": 0.29068368673324585,
"learning_rate": 1.2322860944975573e-06,
"loss": 0.3808,
"step": 2212
},
{
"epoch": 2.3838420107719926,
"grad_norm": 0.285015344619751,
"learning_rate": 1.2281667442759977e-06,
"loss": 0.3758,
"step": 2213
},
{
"epoch": 2.38491921005386,
"grad_norm": 0.2871081531047821,
"learning_rate": 1.22405332654032e-06,
"loss": 0.4035,
"step": 2214
},
{
"epoch": 2.385996409335727,
"grad_norm": 0.3040642738342285,
"learning_rate": 1.219945847760267e-06,
"loss": 0.4132,
"step": 2215
},
{
"epoch": 2.387073608617594,
"grad_norm": 0.26370567083358765,
"learning_rate": 1.2158443143962423e-06,
"loss": 0.3389,
"step": 2216
},
{
"epoch": 2.3881508078994615,
"grad_norm": 0.2896377146244049,
"learning_rate": 1.2117487328992954e-06,
"loss": 0.3861,
"step": 2217
},
{
"epoch": 2.3892280071813286,
"grad_norm": 0.2631506621837616,
"learning_rate": 1.2076591097111184e-06,
"loss": 0.3681,
"step": 2218
},
{
"epoch": 2.3903052064631956,
"grad_norm": 0.27425822615623474,
"learning_rate": 1.2035754512640263e-06,
"loss": 0.3973,
"step": 2219
},
{
"epoch": 2.391382405745063,
"grad_norm": 0.2854865491390228,
"learning_rate": 1.1994977639809575e-06,
"loss": 0.3818,
"step": 2220
},
{
"epoch": 2.39245960502693,
"grad_norm": 0.29627755284309387,
"learning_rate": 1.1954260542754575e-06,
"loss": 0.3397,
"step": 2221
},
{
"epoch": 2.393536804308797,
"grad_norm": 0.27232253551483154,
"learning_rate": 1.191360328551668e-06,
"loss": 0.388,
"step": 2222
},
{
"epoch": 2.3946140035906645,
"grad_norm": 0.26810529828071594,
"learning_rate": 1.1873005932043202e-06,
"loss": 0.3534,
"step": 2223
},
{
"epoch": 2.3956912028725315,
"grad_norm": 0.3001575171947479,
"learning_rate": 1.1832468546187248e-06,
"loss": 0.3726,
"step": 2224
},
{
"epoch": 2.3967684021543985,
"grad_norm": 0.28122803568840027,
"learning_rate": 1.179199119170759e-06,
"loss": 0.3838,
"step": 2225
},
{
"epoch": 2.3978456014362655,
"grad_norm": 0.2687775194644928,
"learning_rate": 1.175157393226859e-06,
"loss": 0.4249,
"step": 2226
},
{
"epoch": 2.398922800718133,
"grad_norm": 0.2854040861129761,
"learning_rate": 1.1711216831440086e-06,
"loss": 0.374,
"step": 2227
},
{
"epoch": 2.4,
"grad_norm": 0.27205950021743774,
"learning_rate": 1.1670919952697267e-06,
"loss": 0.3657,
"step": 2228
},
{
"epoch": 2.4010771992818674,
"grad_norm": 0.2628454267978668,
"learning_rate": 1.1630683359420653e-06,
"loss": 0.3695,
"step": 2229
},
{
"epoch": 2.4021543985637344,
"grad_norm": 0.30919772386550903,
"learning_rate": 1.1590507114895915e-06,
"loss": 0.3898,
"step": 2230
},
{
"epoch": 2.4032315978456014,
"grad_norm": 0.28211405873298645,
"learning_rate": 1.1550391282313817e-06,
"loss": 0.358,
"step": 2231
},
{
"epoch": 2.4043087971274684,
"grad_norm": 0.27124226093292236,
"learning_rate": 1.1510335924770106e-06,
"loss": 0.3383,
"step": 2232
},
{
"epoch": 2.405385996409336,
"grad_norm": 0.26909205317497253,
"learning_rate": 1.1470341105265375e-06,
"loss": 0.3386,
"step": 2233
},
{
"epoch": 2.406463195691203,
"grad_norm": 0.31158143281936646,
"learning_rate": 1.1430406886705053e-06,
"loss": 0.3917,
"step": 2234
},
{
"epoch": 2.4075403949730703,
"grad_norm": 0.2712906002998352,
"learning_rate": 1.1390533331899235e-06,
"loss": 0.3817,
"step": 2235
},
{
"epoch": 2.4086175942549373,
"grad_norm": 0.27246353030204773,
"learning_rate": 1.1350720503562574e-06,
"loss": 0.3796,
"step": 2236
},
{
"epoch": 2.4096947935368043,
"grad_norm": 0.26785191893577576,
"learning_rate": 1.1310968464314249e-06,
"loss": 0.367,
"step": 2237
},
{
"epoch": 2.4107719928186713,
"grad_norm": 0.2866846024990082,
"learning_rate": 1.1271277276677805e-06,
"loss": 0.3888,
"step": 2238
},
{
"epoch": 2.4118491921005387,
"grad_norm": 0.2787204384803772,
"learning_rate": 1.1231647003081092e-06,
"loss": 0.3736,
"step": 2239
},
{
"epoch": 2.4129263913824057,
"grad_norm": 0.2773076295852661,
"learning_rate": 1.119207770585614e-06,
"loss": 0.3504,
"step": 2240
},
{
"epoch": 2.4140035906642727,
"grad_norm": 0.2891734540462494,
"learning_rate": 1.1152569447239076e-06,
"loss": 0.4098,
"step": 2241
},
{
"epoch": 2.41508078994614,
"grad_norm": 0.28226038813591003,
"learning_rate": 1.1113122289370037e-06,
"loss": 0.3683,
"step": 2242
},
{
"epoch": 2.416157989228007,
"grad_norm": 0.28407686948776245,
"learning_rate": 1.1073736294293035e-06,
"loss": 0.3708,
"step": 2243
},
{
"epoch": 2.417235188509874,
"grad_norm": 0.2887764871120453,
"learning_rate": 1.103441152395588e-06,
"loss": 0.3508,
"step": 2244
},
{
"epoch": 2.4183123877917416,
"grad_norm": 0.28351515531539917,
"learning_rate": 1.0995148040210108e-06,
"loss": 0.3838,
"step": 2245
},
{
"epoch": 2.4193895870736086,
"grad_norm": 0.28088366985321045,
"learning_rate": 1.0955945904810855e-06,
"loss": 0.3517,
"step": 2246
},
{
"epoch": 2.4204667863554756,
"grad_norm": 0.2925315201282501,
"learning_rate": 1.0916805179416762e-06,
"loss": 0.3881,
"step": 2247
},
{
"epoch": 2.421543985637343,
"grad_norm": 0.2957456409931183,
"learning_rate": 1.0877725925589883e-06,
"loss": 0.3794,
"step": 2248
},
{
"epoch": 2.42262118491921,
"grad_norm": 0.2947244942188263,
"learning_rate": 1.0838708204795584e-06,
"loss": 0.385,
"step": 2249
},
{
"epoch": 2.423698384201077,
"grad_norm": 0.29007092118263245,
"learning_rate": 1.079975207840247e-06,
"loss": 0.35,
"step": 2250
},
{
"epoch": 2.4247755834829445,
"grad_norm": 0.2962624132633209,
"learning_rate": 1.0760857607682218e-06,
"loss": 0.377,
"step": 2251
},
{
"epoch": 2.4258527827648115,
"grad_norm": 0.29235324263572693,
"learning_rate": 1.0722024853809576e-06,
"loss": 0.4015,
"step": 2252
},
{
"epoch": 2.4269299820466785,
"grad_norm": 0.2606956660747528,
"learning_rate": 1.0683253877862226e-06,
"loss": 0.3588,
"step": 2253
},
{
"epoch": 2.428007181328546,
"grad_norm": 0.26373934745788574,
"learning_rate": 1.064454474082064e-06,
"loss": 0.3628,
"step": 2254
},
{
"epoch": 2.429084380610413,
"grad_norm": 0.2741117775440216,
"learning_rate": 1.0605897503568058e-06,
"loss": 0.3531,
"step": 2255
},
{
"epoch": 2.43016157989228,
"grad_norm": 0.30139395594596863,
"learning_rate": 1.0567312226890365e-06,
"loss": 0.4021,
"step": 2256
},
{
"epoch": 2.431238779174147,
"grad_norm": 0.2639596462249756,
"learning_rate": 1.0528788971475973e-06,
"loss": 0.3396,
"step": 2257
},
{
"epoch": 2.4323159784560144,
"grad_norm": 0.2918408215045929,
"learning_rate": 1.0490327797915767e-06,
"loss": 0.3917,
"step": 2258
},
{
"epoch": 2.4333931777378814,
"grad_norm": 0.2750086486339569,
"learning_rate": 1.045192876670298e-06,
"loss": 0.375,
"step": 2259
},
{
"epoch": 2.434470377019749,
"grad_norm": 0.2692476511001587,
"learning_rate": 1.041359193823307e-06,
"loss": 0.3554,
"step": 2260
},
{
"epoch": 2.435547576301616,
"grad_norm": 0.263949990272522,
"learning_rate": 1.0375317372803711e-06,
"loss": 0.3842,
"step": 2261
},
{
"epoch": 2.436624775583483,
"grad_norm": 0.29045379161834717,
"learning_rate": 1.0337105130614627e-06,
"loss": 0.3828,
"step": 2262
},
{
"epoch": 2.43770197486535,
"grad_norm": 0.2991545498371124,
"learning_rate": 1.0298955271767513e-06,
"loss": 0.3536,
"step": 2263
},
{
"epoch": 2.4387791741472173,
"grad_norm": 0.3013278543949127,
"learning_rate": 1.0260867856265967e-06,
"loss": 0.4122,
"step": 2264
},
{
"epoch": 2.4398563734290843,
"grad_norm": 0.27581077814102173,
"learning_rate": 1.0222842944015327e-06,
"loss": 0.3984,
"step": 2265
},
{
"epoch": 2.4409335727109513,
"grad_norm": 0.26801231503486633,
"learning_rate": 1.0184880594822661e-06,
"loss": 0.3426,
"step": 2266
},
{
"epoch": 2.442010771992819,
"grad_norm": 0.3020249903202057,
"learning_rate": 1.0146980868396644e-06,
"loss": 0.3663,
"step": 2267
},
{
"epoch": 2.443087971274686,
"grad_norm": 0.2986231744289398,
"learning_rate": 1.0109143824347411e-06,
"loss": 0.3812,
"step": 2268
},
{
"epoch": 2.444165170556553,
"grad_norm": 0.2531813979148865,
"learning_rate": 1.0071369522186546e-06,
"loss": 0.3607,
"step": 2269
},
{
"epoch": 2.4452423698384202,
"grad_norm": 0.2960926294326782,
"learning_rate": 1.0033658021326947e-06,
"loss": 0.3886,
"step": 2270
},
{
"epoch": 2.4463195691202873,
"grad_norm": 0.27651941776275635,
"learning_rate": 9.996009381082717e-07,
"loss": 0.3885,
"step": 2271
},
{
"epoch": 2.4473967684021543,
"grad_norm": 0.28147345781326294,
"learning_rate": 9.95842366066911e-07,
"loss": 0.3872,
"step": 2272
},
{
"epoch": 2.4484739676840217,
"grad_norm": 0.2894003987312317,
"learning_rate": 9.920900919202398e-07,
"loss": 0.3668,
"step": 2273
},
{
"epoch": 2.4495511669658887,
"grad_norm": 0.29143601655960083,
"learning_rate": 9.883441215699824e-07,
"loss": 0.3959,
"step": 2274
},
{
"epoch": 2.4506283662477557,
"grad_norm": 0.27094566822052,
"learning_rate": 9.846044609079454e-07,
"loss": 0.3823,
"step": 2275
},
{
"epoch": 2.451705565529623,
"grad_norm": 0.2588837146759033,
"learning_rate": 9.808711158160105e-07,
"loss": 0.3565,
"step": 2276
},
{
"epoch": 2.45278276481149,
"grad_norm": 0.2719583809375763,
"learning_rate": 9.7714409216613e-07,
"loss": 0.4037,
"step": 2277
},
{
"epoch": 2.453859964093357,
"grad_norm": 0.30284786224365234,
"learning_rate": 9.734233958203109e-07,
"loss": 0.3905,
"step": 2278
},
{
"epoch": 2.4549371633752246,
"grad_norm": 0.27402400970458984,
"learning_rate": 9.697090326306096e-07,
"loss": 0.3302,
"step": 2279
},
{
"epoch": 2.4560143626570916,
"grad_norm": 0.2703023850917816,
"learning_rate": 9.660010084391197e-07,
"loss": 0.3708,
"step": 2280
},
{
"epoch": 2.4570915619389586,
"grad_norm": 0.2773868143558502,
"learning_rate": 9.622993290779665e-07,
"loss": 0.3925,
"step": 2281
},
{
"epoch": 2.4581687612208256,
"grad_norm": 0.28041738271713257,
"learning_rate": 9.586040003692965e-07,
"loss": 0.3696,
"step": 2282
},
{
"epoch": 2.459245960502693,
"grad_norm": 0.28617560863494873,
"learning_rate": 9.549150281252633e-07,
"loss": 0.3695,
"step": 2283
},
{
"epoch": 2.46032315978456,
"grad_norm": 0.272877037525177,
"learning_rate": 9.51232418148027e-07,
"loss": 0.3525,
"step": 2284
},
{
"epoch": 2.4614003590664275,
"grad_norm": 0.2908807396888733,
"learning_rate": 9.475561762297414e-07,
"loss": 0.3659,
"step": 2285
},
{
"epoch": 2.4624775583482945,
"grad_norm": 0.30039674043655396,
"learning_rate": 9.438863081525396e-07,
"loss": 0.3795,
"step": 2286
},
{
"epoch": 2.4635547576301615,
"grad_norm": 0.2751771807670593,
"learning_rate": 9.402228196885343e-07,
"loss": 0.3942,
"step": 2287
},
{
"epoch": 2.4646319569120285,
"grad_norm": 0.27453452348709106,
"learning_rate": 9.365657165998021e-07,
"loss": 0.3745,
"step": 2288
},
{
"epoch": 2.465709156193896,
"grad_norm": 0.2895946502685547,
"learning_rate": 9.329150046383773e-07,
"loss": 0.3879,
"step": 2289
},
{
"epoch": 2.466786355475763,
"grad_norm": 0.2859017550945282,
"learning_rate": 9.292706895462411e-07,
"loss": 0.3745,
"step": 2290
},
{
"epoch": 2.46786355475763,
"grad_norm": 0.28885552287101746,
"learning_rate": 9.256327770553152e-07,
"loss": 0.3641,
"step": 2291
},
{
"epoch": 2.4689407540394974,
"grad_norm": 0.29805275797843933,
"learning_rate": 9.220012728874472e-07,
"loss": 0.3989,
"step": 2292
},
{
"epoch": 2.4700179533213644,
"grad_norm": 0.26521188020706177,
"learning_rate": 9.183761827544096e-07,
"loss": 0.3662,
"step": 2293
},
{
"epoch": 2.4710951526032314,
"grad_norm": 0.27879083156585693,
"learning_rate": 9.147575123578845e-07,
"loss": 0.3732,
"step": 2294
},
{
"epoch": 2.472172351885099,
"grad_norm": 0.2808539569377899,
"learning_rate": 9.111452673894589e-07,
"loss": 0.3924,
"step": 2295
},
{
"epoch": 2.473249551166966,
"grad_norm": 0.28251829743385315,
"learning_rate": 9.075394535306087e-07,
"loss": 0.3991,
"step": 2296
},
{
"epoch": 2.474326750448833,
"grad_norm": 0.28753092885017395,
"learning_rate": 9.039400764527001e-07,
"loss": 0.3656,
"step": 2297
},
{
"epoch": 2.4754039497307003,
"grad_norm": 0.2704468369483948,
"learning_rate": 9.003471418169734e-07,
"loss": 0.3321,
"step": 2298
},
{
"epoch": 2.4764811490125673,
"grad_norm": 0.28937360644340515,
"learning_rate": 8.967606552745361e-07,
"loss": 0.3983,
"step": 2299
},
{
"epoch": 2.4775583482944343,
"grad_norm": 0.28396716713905334,
"learning_rate": 8.93180622466352e-07,
"loss": 0.4039,
"step": 2300
},
{
"epoch": 2.478635547576302,
"grad_norm": 0.26640549302101135,
"learning_rate": 8.896070490232361e-07,
"loss": 0.3373,
"step": 2301
},
{
"epoch": 2.479712746858169,
"grad_norm": 0.298631489276886,
"learning_rate": 8.860399405658443e-07,
"loss": 0.3641,
"step": 2302
},
{
"epoch": 2.480789946140036,
"grad_norm": 0.29027625918388367,
"learning_rate": 8.824793027046636e-07,
"loss": 0.3973,
"step": 2303
},
{
"epoch": 2.4818671454219032,
"grad_norm": 0.27970612049102783,
"learning_rate": 8.789251410400024e-07,
"loss": 0.3939,
"step": 2304
},
{
"epoch": 2.4829443447037702,
"grad_norm": 0.26370441913604736,
"learning_rate": 8.753774611619853e-07,
"loss": 0.3535,
"step": 2305
},
{
"epoch": 2.4840215439856372,
"grad_norm": 0.2663581669330597,
"learning_rate": 8.718362686505422e-07,
"loss": 0.3637,
"step": 2306
},
{
"epoch": 2.4850987432675042,
"grad_norm": 0.2874703109264374,
"learning_rate": 8.68301569075396e-07,
"loss": 0.3948,
"step": 2307
},
{
"epoch": 2.4861759425493717,
"grad_norm": 0.263696551322937,
"learning_rate": 8.647733679960596e-07,
"loss": 0.3467,
"step": 2308
},
{
"epoch": 2.4872531418312387,
"grad_norm": 0.28646519780158997,
"learning_rate": 8.612516709618251e-07,
"loss": 0.4307,
"step": 2309
},
{
"epoch": 2.488330341113106,
"grad_norm": 0.24721655249595642,
"learning_rate": 8.577364835117552e-07,
"loss": 0.3457,
"step": 2310
},
{
"epoch": 2.489407540394973,
"grad_norm": 0.2845896780490875,
"learning_rate": 8.542278111746722e-07,
"loss": 0.373,
"step": 2311
},
{
"epoch": 2.49048473967684,
"grad_norm": 0.2640857696533203,
"learning_rate": 8.507256594691532e-07,
"loss": 0.3591,
"step": 2312
},
{
"epoch": 2.491561938958707,
"grad_norm": 0.28080788254737854,
"learning_rate": 8.472300339035178e-07,
"loss": 0.387,
"step": 2313
},
{
"epoch": 2.4926391382405746,
"grad_norm": 0.27218419313430786,
"learning_rate": 8.437409399758234e-07,
"loss": 0.3759,
"step": 2314
},
{
"epoch": 2.4937163375224416,
"grad_norm": 0.2938312292098999,
"learning_rate": 8.402583831738504e-07,
"loss": 0.3598,
"step": 2315
},
{
"epoch": 2.494793536804309,
"grad_norm": 0.28441736102104187,
"learning_rate": 8.367823689751009e-07,
"loss": 0.3511,
"step": 2316
},
{
"epoch": 2.495870736086176,
"grad_norm": 0.2931486964225769,
"learning_rate": 8.333129028467829e-07,
"loss": 0.4074,
"step": 2317
},
{
"epoch": 2.496947935368043,
"grad_norm": 0.2538676857948303,
"learning_rate": 8.29849990245809e-07,
"loss": 0.3539,
"step": 2318
},
{
"epoch": 2.49802513464991,
"grad_norm": 0.27099522948265076,
"learning_rate": 8.263936366187825e-07,
"loss": 0.4047,
"step": 2319
},
{
"epoch": 2.4991023339317775,
"grad_norm": 0.2580559551715851,
"learning_rate": 8.229438474019913e-07,
"loss": 0.3774,
"step": 2320
},
{
"epoch": 2.5001795332136445,
"grad_norm": 0.2654803395271301,
"learning_rate": 8.195006280213969e-07,
"loss": 0.3555,
"step": 2321
},
{
"epoch": 2.5012567324955115,
"grad_norm": 0.2501416504383087,
"learning_rate": 8.160639838926293e-07,
"loss": 0.3478,
"step": 2322
},
{
"epoch": 2.502333931777379,
"grad_norm": 0.3007952570915222,
"learning_rate": 8.126339204209765e-07,
"loss": 0.4177,
"step": 2323
},
{
"epoch": 2.503411131059246,
"grad_norm": 0.28114473819732666,
"learning_rate": 8.092104430013737e-07,
"loss": 0.3829,
"step": 2324
},
{
"epoch": 2.504488330341113,
"grad_norm": 0.2905210852622986,
"learning_rate": 8.057935570184e-07,
"loss": 0.384,
"step": 2325
},
{
"epoch": 2.5055655296229804,
"grad_norm": 0.2917923033237457,
"learning_rate": 8.023832678462667e-07,
"loss": 0.3722,
"step": 2326
},
{
"epoch": 2.5066427289048474,
"grad_norm": 0.27999940514564514,
"learning_rate": 7.989795808488098e-07,
"loss": 0.3433,
"step": 2327
},
{
"epoch": 2.5077199281867144,
"grad_norm": 0.2836602032184601,
"learning_rate": 7.955825013794793e-07,
"loss": 0.3905,
"step": 2328
},
{
"epoch": 2.508797127468582,
"grad_norm": 0.27342960238456726,
"learning_rate": 7.921920347813333e-07,
"loss": 0.3556,
"step": 2329
},
{
"epoch": 2.509874326750449,
"grad_norm": 0.2859393060207367,
"learning_rate": 7.888081863870307e-07,
"loss": 0.3864,
"step": 2330
},
{
"epoch": 2.510951526032316,
"grad_norm": 0.2840021550655365,
"learning_rate": 7.8543096151882e-07,
"loss": 0.3682,
"step": 2331
},
{
"epoch": 2.512028725314183,
"grad_norm": 0.2775287330150604,
"learning_rate": 7.820603654885301e-07,
"loss": 0.3865,
"step": 2332
},
{
"epoch": 2.5131059245960503,
"grad_norm": 0.28231337666511536,
"learning_rate": 7.786964035975658e-07,
"loss": 0.3742,
"step": 2333
},
{
"epoch": 2.5141831238779173,
"grad_norm": 0.2635897696018219,
"learning_rate": 7.753390811368972e-07,
"loss": 0.3271,
"step": 2334
},
{
"epoch": 2.5152603231597848,
"grad_norm": 0.28646957874298096,
"learning_rate": 7.719884033870523e-07,
"loss": 0.4104,
"step": 2335
},
{
"epoch": 2.5163375224416518,
"grad_norm": 0.27592933177948,
"learning_rate": 7.686443756181067e-07,
"loss": 0.3701,
"step": 2336
},
{
"epoch": 2.5174147217235188,
"grad_norm": 0.30637457966804504,
"learning_rate": 7.653070030896775e-07,
"loss": 0.3996,
"step": 2337
},
{
"epoch": 2.5184919210053858,
"grad_norm": 0.28058868646621704,
"learning_rate": 7.619762910509132e-07,
"loss": 0.356,
"step": 2338
},
{
"epoch": 2.519569120287253,
"grad_norm": 0.2630109488964081,
"learning_rate": 7.586522447404882e-07,
"loss": 0.3661,
"step": 2339
},
{
"epoch": 2.52064631956912,
"grad_norm": 0.29673612117767334,
"learning_rate": 7.553348693865897e-07,
"loss": 0.3669,
"step": 2340
},
{
"epoch": 2.5217235188509877,
"grad_norm": 0.2780836224555969,
"learning_rate": 7.520241702069158e-07,
"loss": 0.3861,
"step": 2341
},
{
"epoch": 2.5228007181328547,
"grad_norm": 0.26055651903152466,
"learning_rate": 7.487201524086629e-07,
"loss": 0.3731,
"step": 2342
},
{
"epoch": 2.5238779174147217,
"grad_norm": 0.26952463388442993,
"learning_rate": 7.454228211885184e-07,
"loss": 0.3758,
"step": 2343
},
{
"epoch": 2.5249551166965887,
"grad_norm": 0.2588259279727936,
"learning_rate": 7.421321817326527e-07,
"loss": 0.363,
"step": 2344
},
{
"epoch": 2.526032315978456,
"grad_norm": 0.29151490330696106,
"learning_rate": 7.388482392167118e-07,
"loss": 0.3748,
"step": 2345
},
{
"epoch": 2.527109515260323,
"grad_norm": 0.28330057859420776,
"learning_rate": 7.355709988058091e-07,
"loss": 0.3261,
"step": 2346
},
{
"epoch": 2.5281867145421906,
"grad_norm": 0.28915736079216003,
"learning_rate": 7.32300465654513e-07,
"loss": 0.4078,
"step": 2347
},
{
"epoch": 2.5292639138240576,
"grad_norm": 0.2703227400779724,
"learning_rate": 7.290366449068482e-07,
"loss": 0.3573,
"step": 2348
},
{
"epoch": 2.5303411131059246,
"grad_norm": 0.2650614380836487,
"learning_rate": 7.257795416962754e-07,
"loss": 0.3383,
"step": 2349
},
{
"epoch": 2.5314183123877916,
"grad_norm": 0.29982319474220276,
"learning_rate": 7.225291611456947e-07,
"loss": 0.4243,
"step": 2350
},
{
"epoch": 2.532495511669659,
"grad_norm": 0.2780900299549103,
"learning_rate": 7.19285508367431e-07,
"loss": 0.3339,
"step": 2351
},
{
"epoch": 2.533572710951526,
"grad_norm": 0.2761061489582062,
"learning_rate": 7.160485884632279e-07,
"loss": 0.3345,
"step": 2352
},
{
"epoch": 2.534649910233393,
"grad_norm": 0.30426251888275146,
"learning_rate": 7.128184065242377e-07,
"loss": 0.4213,
"step": 2353
},
{
"epoch": 2.5357271095152605,
"grad_norm": 0.26618361473083496,
"learning_rate": 7.095949676310171e-07,
"loss": 0.3556,
"step": 2354
},
{
"epoch": 2.5368043087971275,
"grad_norm": 0.2708311378955841,
"learning_rate": 7.06378276853516e-07,
"loss": 0.3681,
"step": 2355
},
{
"epoch": 2.5378815080789945,
"grad_norm": 0.27327191829681396,
"learning_rate": 7.031683392510696e-07,
"loss": 0.3747,
"step": 2356
},
{
"epoch": 2.5389587073608615,
"grad_norm": 0.2743578553199768,
"learning_rate": 6.999651598723928e-07,
"loss": 0.3714,
"step": 2357
},
{
"epoch": 2.540035906642729,
"grad_norm": 0.27341485023498535,
"learning_rate": 6.96768743755572e-07,
"loss": 0.3602,
"step": 2358
},
{
"epoch": 2.541113105924596,
"grad_norm": 0.30134743452072144,
"learning_rate": 6.935790959280525e-07,
"loss": 0.4088,
"step": 2359
},
{
"epoch": 2.5421903052064634,
"grad_norm": 0.28071579337120056,
"learning_rate": 6.903962214066367e-07,
"loss": 0.366,
"step": 2360
},
{
"epoch": 2.5432675044883304,
"grad_norm": 0.28438490629196167,
"learning_rate": 6.872201251974747e-07,
"loss": 0.3826,
"step": 2361
},
{
"epoch": 2.5443447037701974,
"grad_norm": 0.2743411660194397,
"learning_rate": 6.840508122960526e-07,
"loss": 0.3389,
"step": 2362
},
{
"epoch": 2.5454219030520644,
"grad_norm": 0.2804628908634186,
"learning_rate": 6.808882876871908e-07,
"loss": 0.3996,
"step": 2363
},
{
"epoch": 2.546499102333932,
"grad_norm": 0.27134260535240173,
"learning_rate": 6.777325563450282e-07,
"loss": 0.3802,
"step": 2364
},
{
"epoch": 2.547576301615799,
"grad_norm": 0.2600853741168976,
"learning_rate": 6.745836232330227e-07,
"loss": 0.352,
"step": 2365
},
{
"epoch": 2.5486535008976663,
"grad_norm": 0.271705687046051,
"learning_rate": 6.714414933039398e-07,
"loss": 0.3787,
"step": 2366
},
{
"epoch": 2.5497307001795333,
"grad_norm": 0.27931052446365356,
"learning_rate": 6.683061714998418e-07,
"loss": 0.3745,
"step": 2367
},
{
"epoch": 2.5508078994614003,
"grad_norm": 0.2835497558116913,
"learning_rate": 6.651776627520856e-07,
"loss": 0.336,
"step": 2368
},
{
"epoch": 2.5518850987432673,
"grad_norm": 0.3035619854927063,
"learning_rate": 6.62055971981313e-07,
"loss": 0.3856,
"step": 2369
},
{
"epoch": 2.5529622980251347,
"grad_norm": 0.27398329973220825,
"learning_rate": 6.589411040974369e-07,
"loss": 0.3663,
"step": 2370
},
{
"epoch": 2.5540394973070017,
"grad_norm": 0.28608840703964233,
"learning_rate": 6.558330639996457e-07,
"loss": 0.3688,
"step": 2371
},
{
"epoch": 2.555116696588869,
"grad_norm": 0.30372244119644165,
"learning_rate": 6.527318565763829e-07,
"loss": 0.3981,
"step": 2372
},
{
"epoch": 2.556193895870736,
"grad_norm": 0.27598169445991516,
"learning_rate": 6.496374867053496e-07,
"loss": 0.3459,
"step": 2373
},
{
"epoch": 2.557271095152603,
"grad_norm": 0.28201863169670105,
"learning_rate": 6.465499592534902e-07,
"loss": 0.3984,
"step": 2374
},
{
"epoch": 2.55834829443447,
"grad_norm": 0.27516603469848633,
"learning_rate": 6.434692790769886e-07,
"loss": 0.3742,
"step": 2375
},
{
"epoch": 2.5594254937163377,
"grad_norm": 0.2833200693130493,
"learning_rate": 6.403954510212585e-07,
"loss": 0.354,
"step": 2376
},
{
"epoch": 2.5605026929982047,
"grad_norm": 0.2715960741043091,
"learning_rate": 6.373284799209351e-07,
"loss": 0.3626,
"step": 2377
},
{
"epoch": 2.5615798922800717,
"grad_norm": 0.27197495102882385,
"learning_rate": 6.342683705998714e-07,
"loss": 0.3797,
"step": 2378
},
{
"epoch": 2.562657091561939,
"grad_norm": 0.2932741343975067,
"learning_rate": 6.312151278711237e-07,
"loss": 0.4351,
"step": 2379
},
{
"epoch": 2.563734290843806,
"grad_norm": 0.29426777362823486,
"learning_rate": 6.281687565369537e-07,
"loss": 0.3912,
"step": 2380
},
{
"epoch": 2.564811490125673,
"grad_norm": 0.309602826833725,
"learning_rate": 6.251292613888094e-07,
"loss": 0.4179,
"step": 2381
},
{
"epoch": 2.5658886894075406,
"grad_norm": 0.2364678680896759,
"learning_rate": 6.220966472073286e-07,
"loss": 0.324,
"step": 2382
},
{
"epoch": 2.5669658886894076,
"grad_norm": 0.2689834237098694,
"learning_rate": 6.190709187623245e-07,
"loss": 0.3866,
"step": 2383
},
{
"epoch": 2.5680430879712746,
"grad_norm": 0.27198678255081177,
"learning_rate": 6.160520808127807e-07,
"loss": 0.3846,
"step": 2384
},
{
"epoch": 2.569120287253142,
"grad_norm": 0.2812858521938324,
"learning_rate": 6.130401381068424e-07,
"loss": 0.3881,
"step": 2385
},
{
"epoch": 2.570197486535009,
"grad_norm": 0.2781459093093872,
"learning_rate": 6.100350953818102e-07,
"loss": 0.3525,
"step": 2386
},
{
"epoch": 2.571274685816876,
"grad_norm": 0.2592630088329315,
"learning_rate": 6.070369573641327e-07,
"loss": 0.3588,
"step": 2387
},
{
"epoch": 2.572351885098743,
"grad_norm": 0.25368958711624146,
"learning_rate": 6.040457287693963e-07,
"loss": 0.3677,
"step": 2388
},
{
"epoch": 2.5734290843806105,
"grad_norm": 0.2784026265144348,
"learning_rate": 6.010614143023231e-07,
"loss": 0.4123,
"step": 2389
},
{
"epoch": 2.5745062836624775,
"grad_norm": 0.2718239724636078,
"learning_rate": 5.980840186567582e-07,
"loss": 0.3505,
"step": 2390
},
{
"epoch": 2.575583482944345,
"grad_norm": 0.27137839794158936,
"learning_rate": 5.951135465156649e-07,
"loss": 0.3843,
"step": 2391
},
{
"epoch": 2.576660682226212,
"grad_norm": 0.28062236309051514,
"learning_rate": 5.921500025511174e-07,
"loss": 0.397,
"step": 2392
},
{
"epoch": 2.577737881508079,
"grad_norm": 0.2782232165336609,
"learning_rate": 5.89193391424292e-07,
"loss": 0.3403,
"step": 2393
},
{
"epoch": 2.578815080789946,
"grad_norm": 0.2708394527435303,
"learning_rate": 5.862437177854629e-07,
"loss": 0.3857,
"step": 2394
},
{
"epoch": 2.5798922800718134,
"grad_norm": 0.2777789533138275,
"learning_rate": 5.833009862739919e-07,
"loss": 0.3518,
"step": 2395
},
{
"epoch": 2.5809694793536804,
"grad_norm": 0.2794034481048584,
"learning_rate": 5.803652015183192e-07,
"loss": 0.4027,
"step": 2396
},
{
"epoch": 2.582046678635548,
"grad_norm": 0.27170926332473755,
"learning_rate": 5.774363681359624e-07,
"loss": 0.3731,
"step": 2397
},
{
"epoch": 2.583123877917415,
"grad_norm": 0.2742154896259308,
"learning_rate": 5.745144907335043e-07,
"loss": 0.3842,
"step": 2398
},
{
"epoch": 2.584201077199282,
"grad_norm": 0.2668771743774414,
"learning_rate": 5.715995739065877e-07,
"loss": 0.3445,
"step": 2399
},
{
"epoch": 2.585278276481149,
"grad_norm": 0.2665964961051941,
"learning_rate": 5.686916222399069e-07,
"loss": 0.3923,
"step": 2400
},
{
"epoch": 2.5863554757630163,
"grad_norm": 0.27437713742256165,
"learning_rate": 5.657906403072e-07,
"loss": 0.3829,
"step": 2401
},
{
"epoch": 2.5874326750448833,
"grad_norm": 0.25516101717948914,
"learning_rate": 5.628966326712453e-07,
"loss": 0.3505,
"step": 2402
},
{
"epoch": 2.5885098743267507,
"grad_norm": 0.2651563882827759,
"learning_rate": 5.60009603883851e-07,
"loss": 0.3806,
"step": 2403
},
{
"epoch": 2.5895870736086177,
"grad_norm": 0.2705548107624054,
"learning_rate": 5.571295584858466e-07,
"loss": 0.3707,
"step": 2404
},
{
"epoch": 2.5906642728904847,
"grad_norm": 0.2829221189022064,
"learning_rate": 5.542565010070799e-07,
"loss": 0.3625,
"step": 2405
},
{
"epoch": 2.5917414721723517,
"grad_norm": 0.27439743280410767,
"learning_rate": 5.513904359664074e-07,
"loss": 0.3762,
"step": 2406
},
{
"epoch": 2.592818671454219,
"grad_norm": 0.26863783597946167,
"learning_rate": 5.485313678716875e-07,
"loss": 0.3776,
"step": 2407
},
{
"epoch": 2.593895870736086,
"grad_norm": 0.26877492666244507,
"learning_rate": 5.456793012197736e-07,
"loss": 0.3579,
"step": 2408
},
{
"epoch": 2.594973070017953,
"grad_norm": 0.2610160708427429,
"learning_rate": 5.428342404965076e-07,
"loss": 0.3658,
"step": 2409
},
{
"epoch": 2.5960502692998206,
"grad_norm": 0.27619680762290955,
"learning_rate": 5.399961901767115e-07,
"loss": 0.4095,
"step": 2410
},
{
"epoch": 2.5971274685816876,
"grad_norm": 0.2557905912399292,
"learning_rate": 5.371651547241802e-07,
"loss": 0.344,
"step": 2411
},
{
"epoch": 2.5982046678635546,
"grad_norm": 0.2994793951511383,
"learning_rate": 5.343411385916769e-07,
"loss": 0.4031,
"step": 2412
},
{
"epoch": 2.5992818671454216,
"grad_norm": 0.2592993676662445,
"learning_rate": 5.315241462209231e-07,
"loss": 0.3794,
"step": 2413
},
{
"epoch": 2.600359066427289,
"grad_norm": 0.2614355683326721,
"learning_rate": 5.287141820425945e-07,
"loss": 0.3978,
"step": 2414
},
{
"epoch": 2.601436265709156,
"grad_norm": 0.28487980365753174,
"learning_rate": 5.259112504763115e-07,
"loss": 0.3626,
"step": 2415
},
{
"epoch": 2.6025134649910235,
"grad_norm": 0.2808648645877838,
"learning_rate": 5.23115355930634e-07,
"loss": 0.3786,
"step": 2416
},
{
"epoch": 2.6035906642728905,
"grad_norm": 0.29835447669029236,
"learning_rate": 5.203265028030541e-07,
"loss": 0.3756,
"step": 2417
},
{
"epoch": 2.6046678635547575,
"grad_norm": 0.28749880194664,
"learning_rate": 5.175446954799874e-07,
"loss": 0.3932,
"step": 2418
},
{
"epoch": 2.6057450628366245,
"grad_norm": 0.27930009365081787,
"learning_rate": 5.147699383367705e-07,
"loss": 0.3375,
"step": 2419
},
{
"epoch": 2.606822262118492,
"grad_norm": 0.2683153450489044,
"learning_rate": 5.120022357376464e-07,
"loss": 0.3645,
"step": 2420
},
{
"epoch": 2.607899461400359,
"grad_norm": 0.28572866320610046,
"learning_rate": 5.092415920357674e-07,
"loss": 0.3883,
"step": 2421
},
{
"epoch": 2.6089766606822264,
"grad_norm": 0.27519190311431885,
"learning_rate": 5.064880115731796e-07,
"loss": 0.3894,
"step": 2422
},
{
"epoch": 2.6100538599640934,
"grad_norm": 0.2632756233215332,
"learning_rate": 5.03741498680822e-07,
"loss": 0.382,
"step": 2423
},
{
"epoch": 2.6111310592459605,
"grad_norm": 0.24517042934894562,
"learning_rate": 5.010020576785174e-07,
"loss": 0.3316,
"step": 2424
},
{
"epoch": 2.6122082585278275,
"grad_norm": 0.3185754418373108,
"learning_rate": 4.982696928749642e-07,
"loss": 0.4429,
"step": 2425
},
{
"epoch": 2.613285457809695,
"grad_norm": 0.2543036937713623,
"learning_rate": 4.955444085677319e-07,
"loss": 0.3481,
"step": 2426
},
{
"epoch": 2.614362657091562,
"grad_norm": 0.27936050295829773,
"learning_rate": 4.928262090432556e-07,
"loss": 0.3887,
"step": 2427
},
{
"epoch": 2.6154398563734294,
"grad_norm": 0.28514647483825684,
"learning_rate": 4.901150985768216e-07,
"loss": 0.3859,
"step": 2428
},
{
"epoch": 2.6165170556552964,
"grad_norm": 0.24403955042362213,
"learning_rate": 4.874110814325723e-07,
"loss": 0.3109,
"step": 2429
},
{
"epoch": 2.6175942549371634,
"grad_norm": 0.2888749837875366,
"learning_rate": 4.847141618634899e-07,
"loss": 0.3842,
"step": 2430
},
{
"epoch": 2.6186714542190304,
"grad_norm": 0.2979486882686615,
"learning_rate": 4.820243441113942e-07,
"loss": 0.4229,
"step": 2431
},
{
"epoch": 2.619748653500898,
"grad_norm": 0.25003379583358765,
"learning_rate": 4.793416324069372e-07,
"loss": 0.3338,
"step": 2432
},
{
"epoch": 2.620825852782765,
"grad_norm": 0.2611185908317566,
"learning_rate": 4.7666603096958865e-07,
"loss": 0.3617,
"step": 2433
},
{
"epoch": 2.621903052064632,
"grad_norm": 0.25409772992134094,
"learning_rate": 4.739975440076405e-07,
"loss": 0.3648,
"step": 2434
},
{
"epoch": 2.6229802513464993,
"grad_norm": 0.25264662504196167,
"learning_rate": 4.713361757181917e-07,
"loss": 0.3572,
"step": 2435
},
{
"epoch": 2.6240574506283663,
"grad_norm": 0.2889323830604553,
"learning_rate": 4.6868193028714814e-07,
"loss": 0.3884,
"step": 2436
},
{
"epoch": 2.6251346499102333,
"grad_norm": 0.27489691972732544,
"learning_rate": 4.6603481188920664e-07,
"loss": 0.39,
"step": 2437
},
{
"epoch": 2.6262118491921003,
"grad_norm": 0.27223941683769226,
"learning_rate": 4.6339482468786e-07,
"loss": 0.3569,
"step": 2438
},
{
"epoch": 2.6272890484739677,
"grad_norm": 0.2794973850250244,
"learning_rate": 4.607619728353818e-07,
"loss": 0.3976,
"step": 2439
},
{
"epoch": 2.6283662477558347,
"grad_norm": 0.2633703351020813,
"learning_rate": 4.581362604728246e-07,
"loss": 0.3686,
"step": 2440
},
{
"epoch": 2.629443447037702,
"grad_norm": 0.26582613587379456,
"learning_rate": 4.5551769173001024e-07,
"loss": 0.3471,
"step": 2441
},
{
"epoch": 2.630520646319569,
"grad_norm": 0.27101296186447144,
"learning_rate": 4.529062707255261e-07,
"loss": 0.3827,
"step": 2442
},
{
"epoch": 2.631597845601436,
"grad_norm": 0.28216707706451416,
"learning_rate": 4.5030200156671534e-07,
"loss": 0.3713,
"step": 2443
},
{
"epoch": 2.632675044883303,
"grad_norm": 0.2950674891471863,
"learning_rate": 4.4770488834967486e-07,
"loss": 0.3577,
"step": 2444
},
{
"epoch": 2.6337522441651706,
"grad_norm": 0.2715361714363098,
"learning_rate": 4.4511493515924373e-07,
"loss": 0.3805,
"step": 2445
},
{
"epoch": 2.6348294434470376,
"grad_norm": 0.27372029423713684,
"learning_rate": 4.425321460690024e-07,
"loss": 0.3833,
"step": 2446
},
{
"epoch": 2.635906642728905,
"grad_norm": 0.2518730163574219,
"learning_rate": 4.3995652514126077e-07,
"loss": 0.3553,
"step": 2447
},
{
"epoch": 2.636983842010772,
"grad_norm": 0.2758457064628601,
"learning_rate": 4.3738807642705663e-07,
"loss": 0.3737,
"step": 2448
},
{
"epoch": 2.638061041292639,
"grad_norm": 0.2756761908531189,
"learning_rate": 4.348268039661452e-07,
"loss": 0.3953,
"step": 2449
},
{
"epoch": 2.639138240574506,
"grad_norm": 0.26307833194732666,
"learning_rate": 4.322727117869951e-07,
"loss": 0.3816,
"step": 2450
},
{
"epoch": 2.6402154398563735,
"grad_norm": 0.2856947183609009,
"learning_rate": 4.2972580390678307e-07,
"loss": 0.3782,
"step": 2451
},
{
"epoch": 2.6412926391382405,
"grad_norm": 0.28101542592048645,
"learning_rate": 4.271860843313835e-07,
"loss": 0.3745,
"step": 2452
},
{
"epoch": 2.642369838420108,
"grad_norm": 0.26329493522644043,
"learning_rate": 4.246535570553667e-07,
"loss": 0.3696,
"step": 2453
},
{
"epoch": 2.643447037701975,
"grad_norm": 0.25769349932670593,
"learning_rate": 4.221282260619891e-07,
"loss": 0.3783,
"step": 2454
},
{
"epoch": 2.644524236983842,
"grad_norm": 0.26422208547592163,
"learning_rate": 4.196100953231896e-07,
"loss": 0.3716,
"step": 2455
},
{
"epoch": 2.645601436265709,
"grad_norm": 0.25533580780029297,
"learning_rate": 4.1709916879958237e-07,
"loss": 0.3562,
"step": 2456
},
{
"epoch": 2.6466786355475764,
"grad_norm": 0.2956748902797699,
"learning_rate": 4.145954504404498e-07,
"loss": 0.4045,
"step": 2457
},
{
"epoch": 2.6477558348294434,
"grad_norm": 0.2733031213283539,
"learning_rate": 4.120989441837381e-07,
"loss": 0.3681,
"step": 2458
},
{
"epoch": 2.6488330341113104,
"grad_norm": 0.2781358063220978,
"learning_rate": 4.0960965395605015e-07,
"loss": 0.365,
"step": 2459
},
{
"epoch": 2.649910233393178,
"grad_norm": 0.2737163007259369,
"learning_rate": 4.0712758367263573e-07,
"loss": 0.3497,
"step": 2460
},
{
"epoch": 2.650987432675045,
"grad_norm": 0.28244391083717346,
"learning_rate": 4.046527372373932e-07,
"loss": 0.3984,
"step": 2461
},
{
"epoch": 2.652064631956912,
"grad_norm": 0.2620738446712494,
"learning_rate": 4.021851185428566e-07,
"loss": 0.3524,
"step": 2462
},
{
"epoch": 2.6531418312387793,
"grad_norm": 0.2576216459274292,
"learning_rate": 3.9972473147019354e-07,
"loss": 0.3849,
"step": 2463
},
{
"epoch": 2.6542190305206463,
"grad_norm": 0.2805669903755188,
"learning_rate": 3.972715798891952e-07,
"loss": 0.3732,
"step": 2464
},
{
"epoch": 2.6552962298025133,
"grad_norm": 0.2930372655391693,
"learning_rate": 3.9482566765827346e-07,
"loss": 0.4041,
"step": 2465
},
{
"epoch": 2.656373429084381,
"grad_norm": 0.2730962932109833,
"learning_rate": 3.92386998624455e-07,
"loss": 0.3635,
"step": 2466
},
{
"epoch": 2.657450628366248,
"grad_norm": 0.30867379903793335,
"learning_rate": 3.899555766233726e-07,
"loss": 0.4031,
"step": 2467
},
{
"epoch": 2.658527827648115,
"grad_norm": 0.24889758229255676,
"learning_rate": 3.8753140547926224e-07,
"loss": 0.3513,
"step": 2468
},
{
"epoch": 2.659605026929982,
"grad_norm": 0.2548186779022217,
"learning_rate": 3.851144890049535e-07,
"loss": 0.3589,
"step": 2469
},
{
"epoch": 2.6606822262118492,
"grad_norm": 0.26514631509780884,
"learning_rate": 3.827048310018661e-07,
"loss": 0.386,
"step": 2470
},
{
"epoch": 2.6617594254937162,
"grad_norm": 0.28183451294898987,
"learning_rate": 3.803024352600049e-07,
"loss": 0.4103,
"step": 2471
},
{
"epoch": 2.6628366247755837,
"grad_norm": 0.2597326636314392,
"learning_rate": 3.7790730555795076e-07,
"loss": 0.3604,
"step": 2472
},
{
"epoch": 2.6639138240574507,
"grad_norm": 0.25949424505233765,
"learning_rate": 3.755194456628569e-07,
"loss": 0.3447,
"step": 2473
},
{
"epoch": 2.6649910233393177,
"grad_norm": 0.27464714646339417,
"learning_rate": 3.731388593304425e-07,
"loss": 0.3852,
"step": 2474
},
{
"epoch": 2.6660682226211847,
"grad_norm": 0.2679389417171478,
"learning_rate": 3.7076555030498505e-07,
"loss": 0.3914,
"step": 2475
},
{
"epoch": 2.667145421903052,
"grad_norm": 0.25600937008857727,
"learning_rate": 3.6839952231931877e-07,
"loss": 0.3257,
"step": 2476
},
{
"epoch": 2.668222621184919,
"grad_norm": 0.2627735137939453,
"learning_rate": 3.6604077909482283e-07,
"loss": 0.377,
"step": 2477
},
{
"epoch": 2.6692998204667866,
"grad_norm": 0.29077112674713135,
"learning_rate": 3.636893243414208e-07,
"loss": 0.4152,
"step": 2478
},
{
"epoch": 2.6703770197486536,
"grad_norm": 0.2747618854045868,
"learning_rate": 3.6134516175757193e-07,
"loss": 0.3744,
"step": 2479
},
{
"epoch": 2.6714542190305206,
"grad_norm": 0.2674185037612915,
"learning_rate": 3.5900829503026644e-07,
"loss": 0.3881,
"step": 2480
},
{
"epoch": 2.6725314183123876,
"grad_norm": 0.2776051163673401,
"learning_rate": 3.5667872783501924e-07,
"loss": 0.3811,
"step": 2481
},
{
"epoch": 2.673608617594255,
"grad_norm": 0.32454177737236023,
"learning_rate": 3.5435646383586374e-07,
"loss": 0.3891,
"step": 2482
},
{
"epoch": 2.674685816876122,
"grad_norm": 0.2767637073993683,
"learning_rate": 3.520415066853483e-07,
"loss": 0.3896,
"step": 2483
},
{
"epoch": 2.6757630161579895,
"grad_norm": 0.2673652768135071,
"learning_rate": 3.497338600245254e-07,
"loss": 0.3767,
"step": 2484
},
{
"epoch": 2.6768402154398565,
"grad_norm": 0.26100921630859375,
"learning_rate": 3.474335274829532e-07,
"loss": 0.3611,
"step": 2485
},
{
"epoch": 2.6779174147217235,
"grad_norm": 0.2723924219608307,
"learning_rate": 3.4514051267868275e-07,
"loss": 0.3531,
"step": 2486
},
{
"epoch": 2.6789946140035905,
"grad_norm": 0.2669923007488251,
"learning_rate": 3.428548192182568e-07,
"loss": 0.3876,
"step": 2487
},
{
"epoch": 2.680071813285458,
"grad_norm": 0.27219128608703613,
"learning_rate": 3.4057645069670353e-07,
"loss": 0.3646,
"step": 2488
},
{
"epoch": 2.681149012567325,
"grad_norm": 0.26371678709983826,
"learning_rate": 3.383054106975292e-07,
"loss": 0.3396,
"step": 2489
},
{
"epoch": 2.682226211849192,
"grad_norm": 0.2728726267814636,
"learning_rate": 3.3604170279271375e-07,
"loss": 0.3623,
"step": 2490
},
{
"epoch": 2.6833034111310594,
"grad_norm": 0.24541893601417542,
"learning_rate": 3.337853305427063e-07,
"loss": 0.3716,
"step": 2491
},
{
"epoch": 2.6843806104129264,
"grad_norm": 0.262883722782135,
"learning_rate": 3.315362974964142e-07,
"loss": 0.3806,
"step": 2492
},
{
"epoch": 2.6854578096947934,
"grad_norm": 0.26691654324531555,
"learning_rate": 3.292946071912051e-07,
"loss": 0.3643,
"step": 2493
},
{
"epoch": 2.6865350089766604,
"grad_norm": 0.2629173696041107,
"learning_rate": 3.270602631528968e-07,
"loss": 0.3559,
"step": 2494
},
{
"epoch": 2.687612208258528,
"grad_norm": 0.27307501435279846,
"learning_rate": 3.2483326889575394e-07,
"loss": 0.3508,
"step": 2495
},
{
"epoch": 2.688689407540395,
"grad_norm": 0.2881261110305786,
"learning_rate": 3.226136279224762e-07,
"loss": 0.409,
"step": 2496
},
{
"epoch": 2.6897666068222623,
"grad_norm": 0.27742505073547363,
"learning_rate": 3.2040134372420373e-07,
"loss": 0.3763,
"step": 2497
},
{
"epoch": 2.6908438061041293,
"grad_norm": 0.2708519697189331,
"learning_rate": 3.1819641978050207e-07,
"loss": 0.3737,
"step": 2498
},
{
"epoch": 2.6919210053859963,
"grad_norm": 0.2512258291244507,
"learning_rate": 3.159988595593616e-07,
"loss": 0.3473,
"step": 2499
},
{
"epoch": 2.6929982046678633,
"grad_norm": 0.27164483070373535,
"learning_rate": 3.1380866651719075e-07,
"loss": 0.399,
"step": 2500
},
{
"epoch": 2.6940754039497308,
"grad_norm": 0.2618269920349121,
"learning_rate": 3.1162584409880904e-07,
"loss": 0.3835,
"step": 2501
},
{
"epoch": 2.6951526032315978,
"grad_norm": 0.24939392507076263,
"learning_rate": 3.0945039573744564e-07,
"loss": 0.3768,
"step": 2502
},
{
"epoch": 2.6962298025134652,
"grad_norm": 0.2539052665233612,
"learning_rate": 3.0728232485472967e-07,
"loss": 0.3644,
"step": 2503
},
{
"epoch": 2.6973070017953322,
"grad_norm": 0.27746522426605225,
"learning_rate": 3.051216348606867e-07,
"loss": 0.3805,
"step": 2504
},
{
"epoch": 2.6983842010771992,
"grad_norm": 0.2758882939815521,
"learning_rate": 3.02968329153735e-07,
"loss": 0.3835,
"step": 2505
},
{
"epoch": 2.6994614003590662,
"grad_norm": 0.26927635073661804,
"learning_rate": 3.0082241112067755e-07,
"loss": 0.3565,
"step": 2506
},
{
"epoch": 2.7005385996409337,
"grad_norm": 0.26184743642807007,
"learning_rate": 2.986838841366962e-07,
"loss": 0.3568,
"step": 2507
},
{
"epoch": 2.7016157989228007,
"grad_norm": 0.2422487586736679,
"learning_rate": 2.96552751565351e-07,
"loss": 0.3557,
"step": 2508
},
{
"epoch": 2.702692998204668,
"grad_norm": 0.27083149552345276,
"learning_rate": 2.944290167585684e-07,
"loss": 0.3853,
"step": 2509
},
{
"epoch": 2.703770197486535,
"grad_norm": 0.27866464853286743,
"learning_rate": 2.9231268305664193e-07,
"loss": 0.3637,
"step": 2510
},
{
"epoch": 2.704847396768402,
"grad_norm": 0.26805853843688965,
"learning_rate": 2.9020375378822297e-07,
"loss": 0.3994,
"step": 2511
},
{
"epoch": 2.705924596050269,
"grad_norm": 0.25744813680648804,
"learning_rate": 2.8810223227031753e-07,
"loss": 0.3614,
"step": 2512
},
{
"epoch": 2.7070017953321366,
"grad_norm": 0.2774718999862671,
"learning_rate": 2.860081218082805e-07,
"loss": 0.3848,
"step": 2513
},
{
"epoch": 2.7080789946140036,
"grad_norm": 0.25574401021003723,
"learning_rate": 2.839214256958106e-07,
"loss": 0.3555,
"step": 2514
},
{
"epoch": 2.7091561938958706,
"grad_norm": 0.2627975046634674,
"learning_rate": 2.818421472149446e-07,
"loss": 0.3541,
"step": 2515
},
{
"epoch": 2.710233393177738,
"grad_norm": 0.2711409032344818,
"learning_rate": 2.7977028963605214e-07,
"loss": 0.3993,
"step": 2516
},
{
"epoch": 2.711310592459605,
"grad_norm": 0.26041677594184875,
"learning_rate": 2.7770585621782973e-07,
"loss": 0.3713,
"step": 2517
},
{
"epoch": 2.712387791741472,
"grad_norm": 0.263475239276886,
"learning_rate": 2.756488502073007e-07,
"loss": 0.3449,
"step": 2518
},
{
"epoch": 2.713464991023339,
"grad_norm": 0.2730863094329834,
"learning_rate": 2.7359927483980254e-07,
"loss": 0.3822,
"step": 2519
},
{
"epoch": 2.7145421903052065,
"grad_norm": 0.3727039098739624,
"learning_rate": 2.7155713333898826e-07,
"loss": 0.3547,
"step": 2520
},
{
"epoch": 2.7156193895870735,
"grad_norm": 0.29259535670280457,
"learning_rate": 2.6952242891681635e-07,
"loss": 0.3839,
"step": 2521
},
{
"epoch": 2.716696588868941,
"grad_norm": 0.29923248291015625,
"learning_rate": 2.674951647735491e-07,
"loss": 0.4094,
"step": 2522
},
{
"epoch": 2.717773788150808,
"grad_norm": 0.25811460614204407,
"learning_rate": 2.654753440977481e-07,
"loss": 0.3599,
"step": 2523
},
{
"epoch": 2.718850987432675,
"grad_norm": 0.2672567367553711,
"learning_rate": 2.634629700662628e-07,
"loss": 0.3805,
"step": 2524
},
{
"epoch": 2.719928186714542,
"grad_norm": 0.2677706778049469,
"learning_rate": 2.6145804584423505e-07,
"loss": 0.3528,
"step": 2525
},
{
"epoch": 2.7210053859964094,
"grad_norm": 0.2716928720474243,
"learning_rate": 2.5946057458508757e-07,
"loss": 0.3736,
"step": 2526
},
{
"epoch": 2.7220825852782764,
"grad_norm": 0.27958476543426514,
"learning_rate": 2.5747055943052044e-07,
"loss": 0.3762,
"step": 2527
},
{
"epoch": 2.723159784560144,
"grad_norm": 0.2665049135684967,
"learning_rate": 2.5548800351050673e-07,
"loss": 0.3379,
"step": 2528
},
{
"epoch": 2.724236983842011,
"grad_norm": 0.27530208230018616,
"learning_rate": 2.5351290994328703e-07,
"loss": 0.4127,
"step": 2529
},
{
"epoch": 2.725314183123878,
"grad_norm": 0.24859978258609772,
"learning_rate": 2.5154528183536584e-07,
"loss": 0.3384,
"step": 2530
},
{
"epoch": 2.726391382405745,
"grad_norm": 0.26949816942214966,
"learning_rate": 2.495851222815049e-07,
"loss": 0.4023,
"step": 2531
},
{
"epoch": 2.7274685816876123,
"grad_norm": 0.2622813284397125,
"learning_rate": 2.476324343647202e-07,
"loss": 0.3549,
"step": 2532
},
{
"epoch": 2.7285457809694793,
"grad_norm": 0.2698315978050232,
"learning_rate": 2.456872211562733e-07,
"loss": 0.3559,
"step": 2533
},
{
"epoch": 2.7296229802513468,
"grad_norm": 0.26423153281211853,
"learning_rate": 2.4374948571567246e-07,
"loss": 0.3805,
"step": 2534
},
{
"epoch": 2.7307001795332138,
"grad_norm": 0.262080579996109,
"learning_rate": 2.4181923109066254e-07,
"loss": 0.3553,
"step": 2535
},
{
"epoch": 2.7317773788150808,
"grad_norm": 0.2714017331600189,
"learning_rate": 2.398964603172238e-07,
"loss": 0.3891,
"step": 2536
},
{
"epoch": 2.7328545780969478,
"grad_norm": 0.26109498739242554,
"learning_rate": 2.3798117641956498e-07,
"loss": 0.3773,
"step": 2537
},
{
"epoch": 2.733931777378815,
"grad_norm": 0.2694284915924072,
"learning_rate": 2.3607338241011745e-07,
"loss": 0.3514,
"step": 2538
},
{
"epoch": 2.735008976660682,
"grad_norm": 0.2672230005264282,
"learning_rate": 2.3417308128953486e-07,
"loss": 0.3893,
"step": 2539
},
{
"epoch": 2.736086175942549,
"grad_norm": 0.26709240674972534,
"learning_rate": 2.3228027604668523e-07,
"loss": 0.3556,
"step": 2540
},
{
"epoch": 2.7371633752244167,
"grad_norm": 0.27405399084091187,
"learning_rate": 2.303949696586444e-07,
"loss": 0.3685,
"step": 2541
},
{
"epoch": 2.7382405745062837,
"grad_norm": 0.2825201451778412,
"learning_rate": 2.28517165090697e-07,
"loss": 0.3937,
"step": 2542
},
{
"epoch": 2.7393177737881507,
"grad_norm": 0.2691894769668579,
"learning_rate": 2.2664686529632608e-07,
"loss": 0.3695,
"step": 2543
},
{
"epoch": 2.740394973070018,
"grad_norm": 0.2729867696762085,
"learning_rate": 2.2478407321721295e-07,
"loss": 0.3741,
"step": 2544
},
{
"epoch": 2.741472172351885,
"grad_norm": 0.26905032992362976,
"learning_rate": 2.2292879178322845e-07,
"loss": 0.389,
"step": 2545
},
{
"epoch": 2.742549371633752,
"grad_norm": 0.2744308412075043,
"learning_rate": 2.2108102391243114e-07,
"loss": 0.382,
"step": 2546
},
{
"epoch": 2.7436265709156196,
"grad_norm": 0.2637563645839691,
"learning_rate": 2.1924077251106346e-07,
"loss": 0.3387,
"step": 2547
},
{
"epoch": 2.7447037701974866,
"grad_norm": 0.2808258831501007,
"learning_rate": 2.1740804047354348e-07,
"loss": 0.4189,
"step": 2548
},
{
"epoch": 2.7457809694793536,
"grad_norm": 0.29728829860687256,
"learning_rate": 2.1558283068246254e-07,
"loss": 0.3739,
"step": 2549
},
{
"epoch": 2.7468581687612206,
"grad_norm": 0.254711776971817,
"learning_rate": 2.1376514600858212e-07,
"loss": 0.3437,
"step": 2550
},
{
"epoch": 2.747935368043088,
"grad_norm": 0.2539973556995392,
"learning_rate": 2.1195498931082748e-07,
"loss": 0.3561,
"step": 2551
},
{
"epoch": 2.749012567324955,
"grad_norm": 0.27282753586769104,
"learning_rate": 2.101523634362834e-07,
"loss": 0.3857,
"step": 2552
},
{
"epoch": 2.7500897666068225,
"grad_norm": 0.2659188210964203,
"learning_rate": 2.0835727122018978e-07,
"loss": 0.392,
"step": 2553
},
{
"epoch": 2.7511669658886895,
"grad_norm": 0.2596139907836914,
"learning_rate": 2.065697154859375e-07,
"loss": 0.3891,
"step": 2554
},
{
"epoch": 2.7522441651705565,
"grad_norm": 0.24862676858901978,
"learning_rate": 2.0478969904506373e-07,
"loss": 0.3352,
"step": 2555
},
{
"epoch": 2.7533213644524235,
"grad_norm": 0.2672061324119568,
"learning_rate": 2.0301722469724728e-07,
"loss": 0.4056,
"step": 2556
},
{
"epoch": 2.754398563734291,
"grad_norm": 0.2543858289718628,
"learning_rate": 2.012522952303042e-07,
"loss": 0.3626,
"step": 2557
},
{
"epoch": 2.755475763016158,
"grad_norm": 0.26948270201683044,
"learning_rate": 1.9949491342018568e-07,
"loss": 0.3884,
"step": 2558
},
{
"epoch": 2.7565529622980254,
"grad_norm": 0.2788434624671936,
"learning_rate": 1.9774508203096843e-07,
"loss": 0.3752,
"step": 2559
},
{
"epoch": 2.7576301615798924,
"grad_norm": 0.27419593930244446,
"learning_rate": 1.9600280381485537e-07,
"loss": 0.3721,
"step": 2560
},
{
"epoch": 2.7587073608617594,
"grad_norm": 0.2659021019935608,
"learning_rate": 1.9426808151217002e-07,
"loss": 0.3973,
"step": 2561
},
{
"epoch": 2.7597845601436264,
"grad_norm": 0.25331631302833557,
"learning_rate": 1.9254091785135154e-07,
"loss": 0.3541,
"step": 2562
},
{
"epoch": 2.760861759425494,
"grad_norm": 0.26807159185409546,
"learning_rate": 1.9082131554894857e-07,
"loss": 0.3751,
"step": 2563
},
{
"epoch": 2.761938958707361,
"grad_norm": 0.27599242329597473,
"learning_rate": 1.8910927730962038e-07,
"loss": 0.3777,
"step": 2564
},
{
"epoch": 2.7630161579892283,
"grad_norm": 0.2834921181201935,
"learning_rate": 1.874048058261252e-07,
"loss": 0.3834,
"step": 2565
},
{
"epoch": 2.7640933572710953,
"grad_norm": 0.23908206820487976,
"learning_rate": 1.8570790377932302e-07,
"loss": 0.3368,
"step": 2566
},
{
"epoch": 2.7651705565529623,
"grad_norm": 0.27744874358177185,
"learning_rate": 1.8401857383816667e-07,
"loss": 0.3842,
"step": 2567
},
{
"epoch": 2.7662477558348293,
"grad_norm": 0.2755833566188812,
"learning_rate": 1.8233681865970076e-07,
"loss": 0.3594,
"step": 2568
},
{
"epoch": 2.7673249551166967,
"grad_norm": 0.2750590145587921,
"learning_rate": 1.806626408890555e-07,
"loss": 0.3762,
"step": 2569
},
{
"epoch": 2.7684021543985637,
"grad_norm": 0.26196321845054626,
"learning_rate": 1.789960431594412e-07,
"loss": 0.3395,
"step": 2570
},
{
"epoch": 2.7694793536804307,
"grad_norm": 0.2838890552520752,
"learning_rate": 1.7733702809214826e-07,
"loss": 0.3816,
"step": 2571
},
{
"epoch": 2.770556552962298,
"grad_norm": 0.2661447823047638,
"learning_rate": 1.7568559829654107e-07,
"loss": 0.3941,
"step": 2572
},
{
"epoch": 2.771633752244165,
"grad_norm": 0.26285287737846375,
"learning_rate": 1.7404175637005083e-07,
"loss": 0.359,
"step": 2573
},
{
"epoch": 2.772710951526032,
"grad_norm": 0.2819158434867859,
"learning_rate": 1.7240550489817652e-07,
"loss": 0.3722,
"step": 2574
},
{
"epoch": 2.773788150807899,
"grad_norm": 0.267031192779541,
"learning_rate": 1.7077684645447846e-07,
"loss": 0.3265,
"step": 2575
},
{
"epoch": 2.7748653500897666,
"grad_norm": 0.27363553643226624,
"learning_rate": 1.6915578360057417e-07,
"loss": 0.4008,
"step": 2576
},
{
"epoch": 2.7759425493716336,
"grad_norm": 0.2736669182777405,
"learning_rate": 1.6754231888613304e-07,
"loss": 0.3864,
"step": 2577
},
{
"epoch": 2.777019748653501,
"grad_norm": 0.2563254237174988,
"learning_rate": 1.6593645484887677e-07,
"loss": 0.3768,
"step": 2578
},
{
"epoch": 2.778096947935368,
"grad_norm": 0.28097283840179443,
"learning_rate": 1.6433819401456996e-07,
"loss": 0.3893,
"step": 2579
},
{
"epoch": 2.779174147217235,
"grad_norm": 0.2700108289718628,
"learning_rate": 1.62747538897019e-07,
"loss": 0.3506,
"step": 2580
},
{
"epoch": 2.780251346499102,
"grad_norm": 0.2690648138523102,
"learning_rate": 1.611644919980676e-07,
"loss": 0.3503,
"step": 2581
},
{
"epoch": 2.7813285457809696,
"grad_norm": 0.2958345115184784,
"learning_rate": 1.5958905580759464e-07,
"loss": 0.3996,
"step": 2582
},
{
"epoch": 2.7824057450628366,
"grad_norm": 0.2642570734024048,
"learning_rate": 1.5802123280350634e-07,
"loss": 0.389,
"step": 2583
},
{
"epoch": 2.783482944344704,
"grad_norm": 0.25878316164016724,
"learning_rate": 1.5646102545173625e-07,
"loss": 0.3485,
"step": 2584
},
{
"epoch": 2.784560143626571,
"grad_norm": 0.2650509178638458,
"learning_rate": 1.5490843620623865e-07,
"loss": 0.3921,
"step": 2585
},
{
"epoch": 2.785637342908438,
"grad_norm": 0.2722346782684326,
"learning_rate": 1.5336346750898678e-07,
"loss": 0.3795,
"step": 2586
},
{
"epoch": 2.786714542190305,
"grad_norm": 0.2627926170825958,
"learning_rate": 1.5182612178996803e-07,
"loss": 0.349,
"step": 2587
},
{
"epoch": 2.7877917414721725,
"grad_norm": 0.28062254190444946,
"learning_rate": 1.5029640146717762e-07,
"loss": 0.3906,
"step": 2588
},
{
"epoch": 2.7888689407540395,
"grad_norm": 0.27385613322257996,
"learning_rate": 1.4877430894662037e-07,
"loss": 0.3863,
"step": 2589
},
{
"epoch": 2.789946140035907,
"grad_norm": 0.270166277885437,
"learning_rate": 1.472598466223024e-07,
"loss": 0.3743,
"step": 2590
},
{
"epoch": 2.791023339317774,
"grad_norm": 0.24716700613498688,
"learning_rate": 1.457530168762289e-07,
"loss": 0.3562,
"step": 2591
},
{
"epoch": 2.792100538599641,
"grad_norm": 0.24457189440727234,
"learning_rate": 1.4425382207839955e-07,
"loss": 0.3454,
"step": 2592
},
{
"epoch": 2.793177737881508,
"grad_norm": 0.2678474187850952,
"learning_rate": 1.4276226458680653e-07,
"loss": 0.3815,
"step": 2593
},
{
"epoch": 2.7942549371633754,
"grad_norm": 0.2846844792366028,
"learning_rate": 1.412783467474299e-07,
"loss": 0.4095,
"step": 2594
},
{
"epoch": 2.7953321364452424,
"grad_norm": 0.2793724536895752,
"learning_rate": 1.3980207089423326e-07,
"loss": 0.3911,
"step": 2595
},
{
"epoch": 2.7964093357271094,
"grad_norm": 0.2453213334083557,
"learning_rate": 1.3833343934916032e-07,
"loss": 0.3559,
"step": 2596
},
{
"epoch": 2.797486535008977,
"grad_norm": 0.28029677271842957,
"learning_rate": 1.368724544221317e-07,
"loss": 0.3974,
"step": 2597
},
{
"epoch": 2.798563734290844,
"grad_norm": 0.23296788334846497,
"learning_rate": 1.3541911841104149e-07,
"loss": 0.326,
"step": 2598
},
{
"epoch": 2.799640933572711,
"grad_norm": 0.28019076585769653,
"learning_rate": 1.3397343360175287e-07,
"loss": 0.4153,
"step": 2599
},
{
"epoch": 2.800718132854578,
"grad_norm": 0.2549351751804352,
"learning_rate": 1.3253540226809524e-07,
"loss": 0.3551,
"step": 2600
},
{
"epoch": 2.8017953321364453,
"grad_norm": 0.2683469355106354,
"learning_rate": 1.3110502667186e-07,
"loss": 0.3321,
"step": 2601
},
{
"epoch": 2.8028725314183123,
"grad_norm": 0.25505879521369934,
"learning_rate": 1.2968230906279745e-07,
"loss": 0.3689,
"step": 2602
},
{
"epoch": 2.8039497307001797,
"grad_norm": 0.28051871061325073,
"learning_rate": 1.282672516786132e-07,
"loss": 0.4006,
"step": 2603
},
{
"epoch": 2.8050269299820467,
"grad_norm": 0.24571263790130615,
"learning_rate": 1.268598567449647e-07,
"loss": 0.3344,
"step": 2604
},
{
"epoch": 2.8061041292639137,
"grad_norm": 0.27598923444747925,
"learning_rate": 1.2546012647545735e-07,
"loss": 0.4113,
"step": 2605
},
{
"epoch": 2.8071813285457807,
"grad_norm": 0.2610470950603485,
"learning_rate": 1.240680630716401e-07,
"loss": 0.3859,
"step": 2606
},
{
"epoch": 2.808258527827648,
"grad_norm": 0.2540163993835449,
"learning_rate": 1.2268366872300596e-07,
"loss": 0.3663,
"step": 2607
},
{
"epoch": 2.809335727109515,
"grad_norm": 0.26400068402290344,
"learning_rate": 1.2130694560698376e-07,
"loss": 0.361,
"step": 2608
},
{
"epoch": 2.8104129263913826,
"grad_norm": 0.26972126960754395,
"learning_rate": 1.1993789588893634e-07,
"loss": 0.379,
"step": 2609
},
{
"epoch": 2.8114901256732496,
"grad_norm": 0.28470125794410706,
"learning_rate": 1.1857652172215905e-07,
"loss": 0.3672,
"step": 2610
},
{
"epoch": 2.8125673249551166,
"grad_norm": 0.28178441524505615,
"learning_rate": 1.1722282524787465e-07,
"loss": 0.376,
"step": 2611
},
{
"epoch": 2.8136445242369836,
"grad_norm": 0.27330464124679565,
"learning_rate": 1.1587680859522832e-07,
"loss": 0.3682,
"step": 2612
},
{
"epoch": 2.814721723518851,
"grad_norm": 0.26830774545669556,
"learning_rate": 1.1453847388128714e-07,
"loss": 0.3501,
"step": 2613
},
{
"epoch": 2.815798922800718,
"grad_norm": 0.281019926071167,
"learning_rate": 1.1320782321103673e-07,
"loss": 0.4017,
"step": 2614
},
{
"epoch": 2.8168761220825855,
"grad_norm": 0.2757217288017273,
"learning_rate": 1.1188485867737631e-07,
"loss": 0.373,
"step": 2615
},
{
"epoch": 2.8179533213644525,
"grad_norm": 0.2641966640949249,
"learning_rate": 1.1056958236111526e-07,
"loss": 0.3667,
"step": 2616
},
{
"epoch": 2.8190305206463195,
"grad_norm": 0.257794588804245,
"learning_rate": 1.0926199633097156e-07,
"loss": 0.3732,
"step": 2617
},
{
"epoch": 2.8201077199281865,
"grad_norm": 0.2658846378326416,
"learning_rate": 1.0796210264356787e-07,
"loss": 0.3771,
"step": 2618
},
{
"epoch": 2.821184919210054,
"grad_norm": 0.27144813537597656,
"learning_rate": 1.0666990334342708e-07,
"loss": 0.3828,
"step": 2619
},
{
"epoch": 2.822262118491921,
"grad_norm": 0.251426100730896,
"learning_rate": 1.0538540046296952e-07,
"loss": 0.3348,
"step": 2620
},
{
"epoch": 2.823339317773788,
"grad_norm": 0.28454217314720154,
"learning_rate": 1.04108596022513e-07,
"loss": 0.3895,
"step": 2621
},
{
"epoch": 2.8244165170556554,
"grad_norm": 0.24853797256946564,
"learning_rate": 1.0283949203026333e-07,
"loss": 0.3614,
"step": 2622
},
{
"epoch": 2.8254937163375224,
"grad_norm": 0.2648945748806,
"learning_rate": 1.015780904823177e-07,
"loss": 0.3516,
"step": 2623
},
{
"epoch": 2.8265709156193894,
"grad_norm": 0.25767838954925537,
"learning_rate": 1.0032439336265742e-07,
"loss": 0.3738,
"step": 2624
},
{
"epoch": 2.827648114901257,
"grad_norm": 0.2541263997554779,
"learning_rate": 9.907840264314572e-08,
"loss": 0.354,
"step": 2625
},
{
"epoch": 2.828725314183124,
"grad_norm": 0.2915237247943878,
"learning_rate": 9.784012028352496e-08,
"loss": 0.3833,
"step": 2626
},
{
"epoch": 2.829802513464991,
"grad_norm": 0.2747429609298706,
"learning_rate": 9.660954823141443e-08,
"loss": 0.3822,
"step": 2627
},
{
"epoch": 2.8308797127468583,
"grad_norm": 0.2576390504837036,
"learning_rate": 9.538668842230536e-08,
"loss": 0.3304,
"step": 2628
},
{
"epoch": 2.8319569120287253,
"grad_norm": 0.24892747402191162,
"learning_rate": 9.417154277955864e-08,
"loss": 0.3679,
"step": 2629
},
{
"epoch": 2.8330341113105924,
"grad_norm": 0.26770374178886414,
"learning_rate": 9.29641132144038e-08,
"loss": 0.4067,
"step": 2630
},
{
"epoch": 2.8341113105924594,
"grad_norm": 0.27645477652549744,
"learning_rate": 9.17644016259317e-08,
"loss": 0.3675,
"step": 2631
},
{
"epoch": 2.835188509874327,
"grad_norm": 0.2851516008377075,
"learning_rate": 9.057240990109628e-08,
"loss": 0.395,
"step": 2632
},
{
"epoch": 2.836265709156194,
"grad_norm": 0.27293872833251953,
"learning_rate": 8.93881399147073e-08,
"loss": 0.3828,
"step": 2633
},
{
"epoch": 2.8373429084380613,
"grad_norm": 0.2663024663925171,
"learning_rate": 8.821159352943142e-08,
"loss": 0.3886,
"step": 2634
},
{
"epoch": 2.8384201077199283,
"grad_norm": 0.2755454480648041,
"learning_rate": 8.704277259578675e-08,
"loss": 0.3865,
"step": 2635
},
{
"epoch": 2.8394973070017953,
"grad_norm": 0.2564866840839386,
"learning_rate": 8.588167895213994e-08,
"loss": 0.3608,
"step": 2636
},
{
"epoch": 2.8405745062836623,
"grad_norm": 0.27022016048431396,
"learning_rate": 8.472831442470408e-08,
"loss": 0.4055,
"step": 2637
},
{
"epoch": 2.8416517055655297,
"grad_norm": 0.27274903655052185,
"learning_rate": 8.358268082753529e-08,
"loss": 0.3771,
"step": 2638
},
{
"epoch": 2.8427289048473967,
"grad_norm": 0.2588178813457489,
"learning_rate": 8.244477996253109e-08,
"loss": 0.3805,
"step": 2639
},
{
"epoch": 2.843806104129264,
"grad_norm": 0.2533237338066101,
"learning_rate": 8.13146136194265e-08,
"loss": 0.3515,
"step": 2640
},
{
"epoch": 2.844883303411131,
"grad_norm": 0.2688315212726593,
"learning_rate": 8.019218357579073e-08,
"loss": 0.4067,
"step": 2641
},
{
"epoch": 2.845960502692998,
"grad_norm": 0.2488754689693451,
"learning_rate": 7.907749159702549e-08,
"loss": 0.3264,
"step": 2642
},
{
"epoch": 2.847037701974865,
"grad_norm": 0.25835171341896057,
"learning_rate": 7.797053943636113e-08,
"loss": 0.3775,
"step": 2643
},
{
"epoch": 2.8481149012567326,
"grad_norm": 0.2600803077220917,
"learning_rate": 7.687132883485548e-08,
"loss": 0.3735,
"step": 2644
},
{
"epoch": 2.8491921005385996,
"grad_norm": 0.26888731122016907,
"learning_rate": 7.57798615213895e-08,
"loss": 0.3967,
"step": 2645
},
{
"epoch": 2.850269299820467,
"grad_norm": 0.2576195299625397,
"learning_rate": 7.46961392126655e-08,
"loss": 0.3677,
"step": 2646
},
{
"epoch": 2.851346499102334,
"grad_norm": 0.25903722643852234,
"learning_rate": 7.362016361320389e-08,
"loss": 0.374,
"step": 2647
},
{
"epoch": 2.852423698384201,
"grad_norm": 0.2514986991882324,
"learning_rate": 7.255193641534097e-08,
"loss": 0.3528,
"step": 2648
},
{
"epoch": 2.853500897666068,
"grad_norm": 0.26224809885025024,
"learning_rate": 7.149145929922607e-08,
"loss": 0.3463,
"step": 2649
},
{
"epoch": 2.8545780969479355,
"grad_norm": 0.2829777002334595,
"learning_rate": 7.043873393281831e-08,
"loss": 0.3867,
"step": 2650
},
{
"epoch": 2.8556552962298025,
"grad_norm": 0.27816978096961975,
"learning_rate": 6.939376197188652e-08,
"loss": 0.3638,
"step": 2651
},
{
"epoch": 2.8567324955116695,
"grad_norm": 0.27708882093429565,
"learning_rate": 6.835654506000101e-08,
"loss": 0.3958,
"step": 2652
},
{
"epoch": 2.857809694793537,
"grad_norm": 0.2444148063659668,
"learning_rate": 6.732708482853845e-08,
"loss": 0.3552,
"step": 2653
},
{
"epoch": 2.858886894075404,
"grad_norm": 0.25524911284446716,
"learning_rate": 6.630538289667365e-08,
"loss": 0.369,
"step": 2654
},
{
"epoch": 2.859964093357271,
"grad_norm": 0.28397855162620544,
"learning_rate": 6.52914408713784e-08,
"loss": 0.4338,
"step": 2655
},
{
"epoch": 2.861041292639138,
"grad_norm": 0.24332918226718903,
"learning_rate": 6.428526034742033e-08,
"loss": 0.3409,
"step": 2656
},
{
"epoch": 2.8621184919210054,
"grad_norm": 0.258176326751709,
"learning_rate": 6.328684290735965e-08,
"loss": 0.3791,
"step": 2657
},
{
"epoch": 2.8631956912028724,
"grad_norm": 0.29297754168510437,
"learning_rate": 6.229619012154575e-08,
"loss": 0.4531,
"step": 2658
},
{
"epoch": 2.86427289048474,
"grad_norm": 0.23918956518173218,
"learning_rate": 6.131330354811616e-08,
"loss": 0.3272,
"step": 2659
},
{
"epoch": 2.865350089766607,
"grad_norm": 0.274402379989624,
"learning_rate": 6.033818473299369e-08,
"loss": 0.3965,
"step": 2660
},
{
"epoch": 2.866427289048474,
"grad_norm": 0.2573271095752716,
"learning_rate": 5.9370835209881516e-08,
"loss": 0.3579,
"step": 2661
},
{
"epoch": 2.867504488330341,
"grad_norm": 0.26528796553611755,
"learning_rate": 5.8411256500265356e-08,
"loss": 0.3499,
"step": 2662
},
{
"epoch": 2.8685816876122083,
"grad_norm": 0.2704995274543762,
"learning_rate": 5.745945011340792e-08,
"loss": 0.3639,
"step": 2663
},
{
"epoch": 2.8696588868940753,
"grad_norm": 0.2413834184408188,
"learning_rate": 5.651541754634726e-08,
"loss": 0.3574,
"step": 2664
},
{
"epoch": 2.870736086175943,
"grad_norm": 0.2598327398300171,
"learning_rate": 5.557916028389454e-08,
"loss": 0.3721,
"step": 2665
},
{
"epoch": 2.87181328545781,
"grad_norm": 0.2590698301792145,
"learning_rate": 5.465067979863126e-08,
"loss": 0.3465,
"step": 2666
},
{
"epoch": 2.872890484739677,
"grad_norm": 0.26652219891548157,
"learning_rate": 5.372997755090758e-08,
"loss": 0.3741,
"step": 2667
},
{
"epoch": 2.873967684021544,
"grad_norm": 0.28296002745628357,
"learning_rate": 5.281705498884071e-08,
"loss": 0.418,
"step": 2668
},
{
"epoch": 2.8750448833034112,
"grad_norm": 0.2394905537366867,
"learning_rate": 5.1911913548309266e-08,
"loss": 0.3704,
"step": 2669
},
{
"epoch": 2.8761220825852782,
"grad_norm": 0.2725091576576233,
"learning_rate": 5.101455465295557e-08,
"loss": 0.372,
"step": 2670
},
{
"epoch": 2.8771992818671457,
"grad_norm": 0.27804869413375854,
"learning_rate": 5.0124979714181173e-08,
"loss": 0.3972,
"step": 2671
},
{
"epoch": 2.8782764811490127,
"grad_norm": 0.27069342136383057,
"learning_rate": 4.924319013114298e-08,
"loss": 0.3521,
"step": 2672
},
{
"epoch": 2.8793536804308797,
"grad_norm": 0.2562558948993683,
"learning_rate": 4.836918729075435e-08,
"loss": 0.3564,
"step": 2673
},
{
"epoch": 2.8804308797127467,
"grad_norm": 0.30231666564941406,
"learning_rate": 4.750297256768177e-08,
"loss": 0.4005,
"step": 2674
},
{
"epoch": 2.881508078994614,
"grad_norm": 0.26869866251945496,
"learning_rate": 4.664454732433987e-08,
"loss": 0.371,
"step": 2675
},
{
"epoch": 2.882585278276481,
"grad_norm": 0.25346073508262634,
"learning_rate": 4.579391291089419e-08,
"loss": 0.3183,
"step": 2676
},
{
"epoch": 2.883662477558348,
"grad_norm": 0.2803615629673004,
"learning_rate": 4.495107066525561e-08,
"loss": 0.4109,
"step": 2677
},
{
"epoch": 2.8847396768402156,
"grad_norm": 0.2551611363887787,
"learning_rate": 4.411602191307873e-08,
"loss": 0.3617,
"step": 2678
},
{
"epoch": 2.8858168761220826,
"grad_norm": 0.2504459023475647,
"learning_rate": 4.328876796776071e-08,
"loss": 0.3653,
"step": 2679
},
{
"epoch": 2.8868940754039496,
"grad_norm": 0.26026296615600586,
"learning_rate": 4.246931013043909e-08,
"loss": 0.3905,
"step": 2680
},
{
"epoch": 2.8879712746858166,
"grad_norm": 0.2713085412979126,
"learning_rate": 4.165764968998842e-08,
"loss": 0.3933,
"step": 2681
},
{
"epoch": 2.889048473967684,
"grad_norm": 0.2717359960079193,
"learning_rate": 4.0853787923020304e-08,
"loss": 0.3231,
"step": 2682
},
{
"epoch": 2.890125673249551,
"grad_norm": 0.28143253922462463,
"learning_rate": 4.0057726093880036e-08,
"loss": 0.3637,
"step": 2683
},
{
"epoch": 2.8912028725314185,
"grad_norm": 0.26846539974212646,
"learning_rate": 3.926946545464327e-08,
"loss": 0.3853,
"step": 2684
},
{
"epoch": 2.8922800718132855,
"grad_norm": 0.25803130865097046,
"learning_rate": 3.848900724511828e-08,
"loss": 0.3675,
"step": 2685
},
{
"epoch": 2.8933572710951525,
"grad_norm": 0.2654448449611664,
"learning_rate": 3.7716352692839796e-08,
"loss": 0.4028,
"step": 2686
},
{
"epoch": 2.8944344703770195,
"grad_norm": 0.2661076486110687,
"learning_rate": 3.6951503013067934e-08,
"loss": 0.3566,
"step": 2687
},
{
"epoch": 2.895511669658887,
"grad_norm": 0.2689468562602997,
"learning_rate": 3.6194459408789294e-08,
"loss": 0.348,
"step": 2688
},
{
"epoch": 2.896588868940754,
"grad_norm": 0.28917914628982544,
"learning_rate": 3.544522307071085e-08,
"loss": 0.386,
"step": 2689
},
{
"epoch": 2.8976660682226214,
"grad_norm": 0.2678031623363495,
"learning_rate": 3.4703795177260526e-08,
"loss": 0.3732,
"step": 2690
},
{
"epoch": 2.8987432675044884,
"grad_norm": 0.2567662298679352,
"learning_rate": 3.3970176894585485e-08,
"loss": 0.3477,
"step": 2691
},
{
"epoch": 2.8998204667863554,
"grad_norm": 0.279474675655365,
"learning_rate": 3.324436937654829e-08,
"loss": 0.3822,
"step": 2692
},
{
"epoch": 2.9008976660682224,
"grad_norm": 0.260716050863266,
"learning_rate": 3.252637376472745e-08,
"loss": 0.3616,
"step": 2693
},
{
"epoch": 2.90197486535009,
"grad_norm": 0.2681705057621002,
"learning_rate": 3.181619118841517e-08,
"loss": 0.3529,
"step": 2694
},
{
"epoch": 2.903052064631957,
"grad_norm": 0.2688203752040863,
"learning_rate": 3.111382276461294e-08,
"loss": 0.3403,
"step": 2695
},
{
"epoch": 2.9041292639138243,
"grad_norm": 0.2653854489326477,
"learning_rate": 3.041926959803376e-08,
"loss": 0.3706,
"step": 2696
},
{
"epoch": 2.9052064631956913,
"grad_norm": 0.2677724063396454,
"learning_rate": 2.9732532781097668e-08,
"loss": 0.3572,
"step": 2697
},
{
"epoch": 2.9062836624775583,
"grad_norm": 0.2722340226173401,
"learning_rate": 2.90536133939312e-08,
"loss": 0.3611,
"step": 2698
},
{
"epoch": 2.9073608617594253,
"grad_norm": 0.2826050817966461,
"learning_rate": 2.838251250436519e-08,
"loss": 0.3964,
"step": 2699
},
{
"epoch": 2.9084380610412928,
"grad_norm": 0.2590412199497223,
"learning_rate": 2.771923116793307e-08,
"loss": 0.3684,
"step": 2700
},
{
"epoch": 2.9095152603231598,
"grad_norm": 0.26464948058128357,
"learning_rate": 2.706377042786923e-08,
"loss": 0.3682,
"step": 2701
},
{
"epoch": 2.9105924596050268,
"grad_norm": 0.27499884366989136,
"learning_rate": 2.6416131315107895e-08,
"loss": 0.3724,
"step": 2702
},
{
"epoch": 2.911669658886894,
"grad_norm": 0.26174575090408325,
"learning_rate": 2.577631484828147e-08,
"loss": 0.3434,
"step": 2703
},
{
"epoch": 2.912746858168761,
"grad_norm": 0.2640121579170227,
"learning_rate": 2.5144322033717748e-08,
"loss": 0.4157,
"step": 2704
},
{
"epoch": 2.9138240574506282,
"grad_norm": 0.2796087861061096,
"learning_rate": 2.4520153865439377e-08,
"loss": 0.3622,
"step": 2705
},
{
"epoch": 2.9149012567324957,
"grad_norm": 0.26384004950523376,
"learning_rate": 2.3903811325163285e-08,
"loss": 0.36,
"step": 2706
},
{
"epoch": 2.9159784560143627,
"grad_norm": 0.2627621293067932,
"learning_rate": 2.329529538229569e-08,
"loss": 0.391,
"step": 2707
},
{
"epoch": 2.9170556552962297,
"grad_norm": 0.2715895175933838,
"learning_rate": 2.2694606993934886e-08,
"loss": 0.3764,
"step": 2708
},
{
"epoch": 2.918132854578097,
"grad_norm": 0.2541747987270355,
"learning_rate": 2.210174710486679e-08,
"loss": 0.3667,
"step": 2709
},
{
"epoch": 2.919210053859964,
"grad_norm": 0.26594072580337524,
"learning_rate": 2.1516716647564383e-08,
"loss": 0.3646,
"step": 2710
},
{
"epoch": 2.920287253141831,
"grad_norm": 0.2747040092945099,
"learning_rate": 2.0939516542186066e-08,
"loss": 0.3754,
"step": 2711
},
{
"epoch": 2.921364452423698,
"grad_norm": 0.2700275182723999,
"learning_rate": 2.0370147696574528e-08,
"loss": 0.3537,
"step": 2712
},
{
"epoch": 2.9224416517055656,
"grad_norm": 0.2625963091850281,
"learning_rate": 1.9808611006256196e-08,
"loss": 0.3662,
"step": 2713
},
{
"epoch": 2.9235188509874326,
"grad_norm": 0.2688051164150238,
"learning_rate": 1.9254907354436804e-08,
"loss": 0.4109,
"step": 2714
},
{
"epoch": 2.9245960502693,
"grad_norm": 0.26145994663238525,
"learning_rate": 1.8709037612003044e-08,
"loss": 0.3506,
"step": 2715
},
{
"epoch": 2.925673249551167,
"grad_norm": 0.2681027054786682,
"learning_rate": 1.8171002637520362e-08,
"loss": 0.3838,
"step": 2716
},
{
"epoch": 2.926750448833034,
"grad_norm": 0.26508158445358276,
"learning_rate": 1.764080327723128e-08,
"loss": 0.3774,
"step": 2717
},
{
"epoch": 2.927827648114901,
"grad_norm": 0.2793397605419159,
"learning_rate": 1.7118440365053723e-08,
"loss": 0.3811,
"step": 2718
},
{
"epoch": 2.9289048473967685,
"grad_norm": 0.2447550743818283,
"learning_rate": 1.6603914722579938e-08,
"loss": 0.3566,
"step": 2719
},
{
"epoch": 2.9299820466786355,
"grad_norm": 0.26184481382369995,
"learning_rate": 1.6097227159075912e-08,
"loss": 0.3635,
"step": 2720
},
{
"epoch": 2.931059245960503,
"grad_norm": 0.2621864974498749,
"learning_rate": 1.559837847148027e-08,
"loss": 0.3668,
"step": 2721
},
{
"epoch": 2.93213644524237,
"grad_norm": 0.26011908054351807,
"learning_rate": 1.51073694444015e-08,
"loss": 0.3523,
"step": 2722
},
{
"epoch": 2.933213644524237,
"grad_norm": 0.2705157399177551,
"learning_rate": 1.4624200850116844e-08,
"loss": 0.4118,
"step": 2723
},
{
"epoch": 2.934290843806104,
"grad_norm": 0.2857845723628998,
"learning_rate": 1.4148873448573408e-08,
"loss": 0.4051,
"step": 2724
},
{
"epoch": 2.9353680430879714,
"grad_norm": 0.2570638060569763,
"learning_rate": 1.368138798738372e-08,
"loss": 0.3463,
"step": 2725
},
{
"epoch": 2.9364452423698384,
"grad_norm": 0.26908525824546814,
"learning_rate": 1.3221745201828507e-08,
"loss": 0.406,
"step": 2726
},
{
"epoch": 2.937522441651706,
"grad_norm": 0.26163509488105774,
"learning_rate": 1.2769945814850582e-08,
"loss": 0.3665,
"step": 2727
},
{
"epoch": 2.938599640933573,
"grad_norm": 0.26050251722335815,
"learning_rate": 1.2325990537057631e-08,
"loss": 0.3469,
"step": 2728
},
{
"epoch": 2.93967684021544,
"grad_norm": 0.25835105776786804,
"learning_rate": 1.1889880066720538e-08,
"loss": 0.3636,
"step": 2729
},
{
"epoch": 2.940754039497307,
"grad_norm": 0.2773009240627289,
"learning_rate": 1.1461615089770062e-08,
"loss": 0.4345,
"step": 2730
},
{
"epoch": 2.9418312387791743,
"grad_norm": 0.272200345993042,
"learning_rate": 1.1041196279798493e-08,
"loss": 0.3362,
"step": 2731
},
{
"epoch": 2.9429084380610413,
"grad_norm": 0.2865847647190094,
"learning_rate": 1.0628624298056888e-08,
"loss": 0.3953,
"step": 2732
},
{
"epoch": 2.9439856373429083,
"grad_norm": 0.25535234808921814,
"learning_rate": 1.0223899793453951e-08,
"loss": 0.3538,
"step": 2733
},
{
"epoch": 2.9450628366247757,
"grad_norm": 0.275097519159317,
"learning_rate": 9.827023402556035e-09,
"loss": 0.3907,
"step": 2734
},
{
"epoch": 2.9461400359066428,
"grad_norm": 0.27777495980262756,
"learning_rate": 9.437995749586593e-09,
"loss": 0.3304,
"step": 2735
},
{
"epoch": 2.9472172351885098,
"grad_norm": 0.297230988740921,
"learning_rate": 9.05681744642284e-09,
"loss": 0.3531,
"step": 2736
},
{
"epoch": 2.9482944344703768,
"grad_norm": 0.26701468229293823,
"learning_rate": 8.68348909259742e-09,
"loss": 0.3728,
"step": 2737
},
{
"epoch": 2.949371633752244,
"grad_norm": 0.2648324966430664,
"learning_rate": 8.318011275294525e-09,
"loss": 0.3594,
"step": 2738
},
{
"epoch": 2.950448833034111,
"grad_norm": 0.2587242126464844,
"learning_rate": 7.96038456935322e-09,
"loss": 0.3478,
"step": 2739
},
{
"epoch": 2.9515260323159787,
"grad_norm": 0.25219130516052246,
"learning_rate": 7.610609537261337e-09,
"loss": 0.3576,
"step": 2740
},
{
"epoch": 2.9526032315978457,
"grad_norm": 0.2940714657306671,
"learning_rate": 7.268686729159369e-09,
"loss": 0.3884,
"step": 2741
},
{
"epoch": 2.9536804308797127,
"grad_norm": 0.2683965265750885,
"learning_rate": 6.9346166828371256e-09,
"loss": 0.3951,
"step": 2742
},
{
"epoch": 2.9547576301615797,
"grad_norm": 0.2479734867811203,
"learning_rate": 6.6083999237320786e-09,
"loss": 0.3471,
"step": 2743
},
{
"epoch": 2.955834829443447,
"grad_norm": 0.2499818354845047,
"learning_rate": 6.2900369649315785e-09,
"loss": 0.3435,
"step": 2744
},
{
"epoch": 2.956912028725314,
"grad_norm": 0.2655635178089142,
"learning_rate": 5.979528307168414e-09,
"loss": 0.3973,
"step": 2745
},
{
"epoch": 2.9579892280071816,
"grad_norm": 0.2900417745113373,
"learning_rate": 5.676874438823032e-09,
"loss": 0.3786,
"step": 2746
},
{
"epoch": 2.9590664272890486,
"grad_norm": 0.25159966945648193,
"learning_rate": 5.382075835921319e-09,
"loss": 0.3546,
"step": 2747
},
{
"epoch": 2.9601436265709156,
"grad_norm": 0.284942626953125,
"learning_rate": 5.095132962134042e-09,
"loss": 0.4079,
"step": 2748
},
{
"epoch": 2.9612208258527826,
"grad_norm": 0.2491726577281952,
"learning_rate": 4.816046268775742e-09,
"loss": 0.3631,
"step": 2749
},
{
"epoch": 2.96229802513465,
"grad_norm": 0.2669207453727722,
"learning_rate": 4.5448161948047355e-09,
"loss": 0.3941,
"step": 2750
},
{
"epoch": 2.963375224416517,
"grad_norm": 0.26818183064460754,
"learning_rate": 4.281443166822552e-09,
"loss": 0.3969,
"step": 2751
},
{
"epoch": 2.9644524236983845,
"grad_norm": 0.25657200813293457,
"learning_rate": 4.0259275990722764e-09,
"loss": 0.3578,
"step": 2752
},
{
"epoch": 2.9655296229802515,
"grad_norm": 0.2637154161930084,
"learning_rate": 3.778269893439101e-09,
"loss": 0.3702,
"step": 2753
},
{
"epoch": 2.9666068222621185,
"grad_norm": 0.2604050636291504,
"learning_rate": 3.538470439448105e-09,
"loss": 0.3806,
"step": 2754
},
{
"epoch": 2.9676840215439855,
"grad_norm": 0.264647901058197,
"learning_rate": 3.3065296142659188e-09,
"loss": 0.3662,
"step": 2755
},
{
"epoch": 2.968761220825853,
"grad_norm": 0.26899972558021545,
"learning_rate": 3.0824477826979504e-09,
"loss": 0.3555,
"step": 2756
},
{
"epoch": 2.96983842010772,
"grad_norm": 0.2733704149723053,
"learning_rate": 2.86622529718783e-09,
"loss": 0.4007,
"step": 2757
},
{
"epoch": 2.970915619389587,
"grad_norm": 0.25085026025772095,
"learning_rate": 2.657862497820185e-09,
"loss": 0.346,
"step": 2758
},
{
"epoch": 2.9719928186714544,
"grad_norm": 0.2898995280265808,
"learning_rate": 2.4573597123145333e-09,
"loss": 0.3975,
"step": 2759
},
{
"epoch": 2.9730700179533214,
"grad_norm": 0.255732923746109,
"learning_rate": 2.264717256030835e-09,
"loss": 0.4053,
"step": 2760
},
{
"epoch": 2.9741472172351884,
"grad_norm": 0.24689188599586487,
"learning_rate": 2.079935431963942e-09,
"loss": 0.365,
"step": 2761
},
{
"epoch": 2.9752244165170554,
"grad_norm": 0.2516534924507141,
"learning_rate": 1.903014530745817e-09,
"loss": 0.3502,
"step": 2762
},
{
"epoch": 2.976301615798923,
"grad_norm": 0.280439168214798,
"learning_rate": 1.7339548306449794e-09,
"loss": 0.4066,
"step": 2763
},
{
"epoch": 2.97737881508079,
"grad_norm": 0.2619931101799011,
"learning_rate": 1.5727565975642844e-09,
"loss": 0.3337,
"step": 2764
},
{
"epoch": 2.9784560143626573,
"grad_norm": 0.2771995961666107,
"learning_rate": 1.419420085043699e-09,
"loss": 0.4036,
"step": 2765
},
{
"epoch": 2.9795332136445243,
"grad_norm": 0.2701093852519989,
"learning_rate": 1.2739455342558605e-09,
"loss": 0.406,
"step": 2766
},
{
"epoch": 2.9806104129263913,
"grad_norm": 0.2730431854724884,
"learning_rate": 1.1363331740094075e-09,
"loss": 0.3554,
"step": 2767
},
{
"epoch": 2.9816876122082583,
"grad_norm": 0.2705381512641907,
"learning_rate": 1.0065832207462045e-09,
"loss": 0.3695,
"step": 2768
},
{
"epoch": 2.9827648114901257,
"grad_norm": 0.2739293575286865,
"learning_rate": 8.846958785418969e-10,
"loss": 0.3781,
"step": 2769
},
{
"epoch": 2.9838420107719927,
"grad_norm": 0.2708894610404968,
"learning_rate": 7.706713391059107e-10,
"loss": 0.37,
"step": 2770
},
{
"epoch": 2.98491921005386,
"grad_norm": 0.26479482650756836,
"learning_rate": 6.645097817797874e-10,
"loss": 0.3348,
"step": 2771
},
{
"epoch": 2.985996409335727,
"grad_norm": 0.26794207096099854,
"learning_rate": 5.662113735394048e-10,
"loss": 0.3752,
"step": 2772
},
{
"epoch": 2.987073608617594,
"grad_norm": 0.25619226694107056,
"learning_rate": 4.757762689922008e-10,
"loss": 0.3704,
"step": 2773
},
{
"epoch": 2.988150807899461,
"grad_norm": 0.28731483221054077,
"learning_rate": 3.9320461037772873e-10,
"loss": 0.4023,
"step": 2774
},
{
"epoch": 2.9892280071813286,
"grad_norm": 0.280960351228714,
"learning_rate": 3.184965275676577e-10,
"loss": 0.4169,
"step": 2775
},
{
"epoch": 2.9903052064631956,
"grad_norm": 0.26428189873695374,
"learning_rate": 2.5165213806632726e-10,
"loss": 0.3637,
"step": 2776
},
{
"epoch": 2.991382405745063,
"grad_norm": 0.2421150952577591,
"learning_rate": 1.926715470090823e-10,
"loss": 0.3619,
"step": 2777
},
{
"epoch": 2.99245960502693,
"grad_norm": 0.2729114592075348,
"learning_rate": 1.4155484716227296e-10,
"loss": 0.3948,
"step": 2778
},
{
"epoch": 2.993536804308797,
"grad_norm": 0.2836900055408478,
"learning_rate": 9.830211892492004e-11,
"loss": 0.3908,
"step": 2779
},
{
"epoch": 2.994614003590664,
"grad_norm": 0.2800293266773224,
"learning_rate": 6.291343032649444e-11,
"loss": 0.37,
"step": 2780
},
{
"epoch": 2.9956912028725315,
"grad_norm": 0.2719917595386505,
"learning_rate": 3.538883702747242e-11,
"loss": 0.3686,
"step": 2781
},
{
"epoch": 2.9967684021543985,
"grad_norm": 0.27564868330955505,
"learning_rate": 1.5728382319890602e-11,
"loss": 0.3806,
"step": 2782
},
{
"epoch": 2.9978456014362656,
"grad_norm": 0.28030478954315186,
"learning_rate": 3.9320971262357676e-12,
"loss": 0.3715,
"step": 2783
},
{
"epoch": 2.998922800718133,
"grad_norm": 0.25694915652275085,
"learning_rate": 0.0,
"loss": 0.3591,
"step": 2784
},
{
"epoch": 2.998922800718133,
"step": 2784,
"total_flos": 4113291007754240.0,
"train_loss": 0.41809357445815515,
"train_runtime": 59036.9055,
"train_samples_per_second": 4.529,
"train_steps_per_second": 0.047
}
],
"logging_steps": 1.0,
"max_steps": 2784,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4113291007754240.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}