tangken333's picture
End of training
6f6439f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.987241531016278,
"eval_steps": 500,
"global_step": 567,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005279366476022877,
"grad_norm": 9.06258192697587,
"learning_rate": 0.0,
"loss": 1.678,
"step": 1
},
{
"epoch": 0.010558732952045754,
"grad_norm": 9.487470045686269,
"learning_rate": 1.7543859649122808e-07,
"loss": 1.8487,
"step": 2
},
{
"epoch": 0.01583809942806863,
"grad_norm": 8.792837550815417,
"learning_rate": 3.5087719298245616e-07,
"loss": 1.6855,
"step": 3
},
{
"epoch": 0.02111746590409151,
"grad_norm": 9.323941973098307,
"learning_rate": 5.263157894736843e-07,
"loss": 1.79,
"step": 4
},
{
"epoch": 0.026396832380114386,
"grad_norm": 8.5661762778806,
"learning_rate": 7.017543859649123e-07,
"loss": 1.6796,
"step": 5
},
{
"epoch": 0.03167619885613726,
"grad_norm": 8.780182073885886,
"learning_rate": 8.771929824561404e-07,
"loss": 1.8326,
"step": 6
},
{
"epoch": 0.03695556533216014,
"grad_norm": 8.658784025795836,
"learning_rate": 1.0526315789473685e-06,
"loss": 1.7229,
"step": 7
},
{
"epoch": 0.04223493180818302,
"grad_norm": 8.678284010844528,
"learning_rate": 1.2280701754385965e-06,
"loss": 1.782,
"step": 8
},
{
"epoch": 0.0475142982842059,
"grad_norm": 7.4972200445720425,
"learning_rate": 1.4035087719298246e-06,
"loss": 1.6596,
"step": 9
},
{
"epoch": 0.05279366476022877,
"grad_norm": 8.204566912821951,
"learning_rate": 1.5789473684210526e-06,
"loss": 1.6707,
"step": 10
},
{
"epoch": 0.05807303123625165,
"grad_norm": 7.461093116720939,
"learning_rate": 1.7543859649122807e-06,
"loss": 1.718,
"step": 11
},
{
"epoch": 0.06335239771227452,
"grad_norm": 6.113172095963636,
"learning_rate": 1.929824561403509e-06,
"loss": 1.5939,
"step": 12
},
{
"epoch": 0.0686317641882974,
"grad_norm": 5.4039227506301835,
"learning_rate": 2.105263157894737e-06,
"loss": 1.497,
"step": 13
},
{
"epoch": 0.07391113066432028,
"grad_norm": 5.072244053671971,
"learning_rate": 2.280701754385965e-06,
"loss": 1.4584,
"step": 14
},
{
"epoch": 0.07919049714034315,
"grad_norm": 4.290896240507921,
"learning_rate": 2.456140350877193e-06,
"loss": 1.4996,
"step": 15
},
{
"epoch": 0.08446986361636603,
"grad_norm": 3.563114688671191,
"learning_rate": 2.631578947368421e-06,
"loss": 1.4042,
"step": 16
},
{
"epoch": 0.08974923009238892,
"grad_norm": 3.598254571007457,
"learning_rate": 2.8070175438596493e-06,
"loss": 1.4125,
"step": 17
},
{
"epoch": 0.0950285965684118,
"grad_norm": 3.6215074072359865,
"learning_rate": 2.9824561403508774e-06,
"loss": 1.4723,
"step": 18
},
{
"epoch": 0.10030796304443466,
"grad_norm": 2.915485138222026,
"learning_rate": 3.157894736842105e-06,
"loss": 1.2987,
"step": 19
},
{
"epoch": 0.10558732952045755,
"grad_norm": 2.4634215162994595,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.2651,
"step": 20
},
{
"epoch": 0.11086669599648043,
"grad_norm": 2.3107374113003103,
"learning_rate": 3.5087719298245615e-06,
"loss": 1.2512,
"step": 21
},
{
"epoch": 0.1161460624725033,
"grad_norm": 2.7621133107185707,
"learning_rate": 3.6842105263157896e-06,
"loss": 1.3191,
"step": 22
},
{
"epoch": 0.12142542894852618,
"grad_norm": 2.877480148788483,
"learning_rate": 3.859649122807018e-06,
"loss": 1.2354,
"step": 23
},
{
"epoch": 0.12670479542454904,
"grad_norm": 2.329645780694447,
"learning_rate": 4.035087719298246e-06,
"loss": 1.174,
"step": 24
},
{
"epoch": 0.13198416190057194,
"grad_norm": 2.692865294147214,
"learning_rate": 4.210526315789474e-06,
"loss": 1.2824,
"step": 25
},
{
"epoch": 0.1372635283765948,
"grad_norm": 2.2247599668267615,
"learning_rate": 4.385964912280702e-06,
"loss": 1.2023,
"step": 26
},
{
"epoch": 0.14254289485261767,
"grad_norm": 2.038589631126849,
"learning_rate": 4.56140350877193e-06,
"loss": 1.1412,
"step": 27
},
{
"epoch": 0.14782226132864057,
"grad_norm": 2.0910541565756002,
"learning_rate": 4.736842105263158e-06,
"loss": 1.1989,
"step": 28
},
{
"epoch": 0.15310162780466344,
"grad_norm": 1.7517601173871857,
"learning_rate": 4.912280701754386e-06,
"loss": 1.0738,
"step": 29
},
{
"epoch": 0.1583809942806863,
"grad_norm": 1.6413195717279263,
"learning_rate": 5.087719298245615e-06,
"loss": 1.0853,
"step": 30
},
{
"epoch": 0.1636603607567092,
"grad_norm": 1.688876231888243,
"learning_rate": 5.263157894736842e-06,
"loss": 1.061,
"step": 31
},
{
"epoch": 0.16893972723273207,
"grad_norm": 1.5670006557231517,
"learning_rate": 5.438596491228071e-06,
"loss": 1.1496,
"step": 32
},
{
"epoch": 0.17421909370875496,
"grad_norm": 1.7072674079415744,
"learning_rate": 5.6140350877192985e-06,
"loss": 1.1424,
"step": 33
},
{
"epoch": 0.17949846018477783,
"grad_norm": 1.6307968487984736,
"learning_rate": 5.789473684210527e-06,
"loss": 1.0655,
"step": 34
},
{
"epoch": 0.1847778266608007,
"grad_norm": 1.6358139036334913,
"learning_rate": 5.964912280701755e-06,
"loss": 1.0241,
"step": 35
},
{
"epoch": 0.1900571931368236,
"grad_norm": 1.4688899759958145,
"learning_rate": 6.140350877192983e-06,
"loss": 1.0076,
"step": 36
},
{
"epoch": 0.19533655961284646,
"grad_norm": 1.5254490097810096,
"learning_rate": 6.31578947368421e-06,
"loss": 1.0991,
"step": 37
},
{
"epoch": 0.20061592608886933,
"grad_norm": 1.5005708253602146,
"learning_rate": 6.491228070175439e-06,
"loss": 1.0824,
"step": 38
},
{
"epoch": 0.20589529256489222,
"grad_norm": 1.420109858670133,
"learning_rate": 6.666666666666667e-06,
"loss": 0.994,
"step": 39
},
{
"epoch": 0.2111746590409151,
"grad_norm": 1.4491053784050478,
"learning_rate": 6.842105263157896e-06,
"loss": 1.0583,
"step": 40
},
{
"epoch": 0.21645402551693796,
"grad_norm": 1.448588096978498,
"learning_rate": 7.017543859649123e-06,
"loss": 0.9892,
"step": 41
},
{
"epoch": 0.22173339199296085,
"grad_norm": 1.4526199178661001,
"learning_rate": 7.192982456140352e-06,
"loss": 1.0051,
"step": 42
},
{
"epoch": 0.22701275846898372,
"grad_norm": 1.4693112027438093,
"learning_rate": 7.368421052631579e-06,
"loss": 0.9833,
"step": 43
},
{
"epoch": 0.2322921249450066,
"grad_norm": 1.3739429688061344,
"learning_rate": 7.5438596491228074e-06,
"loss": 0.9793,
"step": 44
},
{
"epoch": 0.23757149142102948,
"grad_norm": 1.4247570212621312,
"learning_rate": 7.719298245614036e-06,
"loss": 0.9366,
"step": 45
},
{
"epoch": 0.24285085789705235,
"grad_norm": 1.2987600676752833,
"learning_rate": 7.894736842105265e-06,
"loss": 0.9006,
"step": 46
},
{
"epoch": 0.24813022437307522,
"grad_norm": 1.5006141932812773,
"learning_rate": 8.070175438596492e-06,
"loss": 0.9785,
"step": 47
},
{
"epoch": 0.2534095908490981,
"grad_norm": 1.468806773441352,
"learning_rate": 8.24561403508772e-06,
"loss": 0.9941,
"step": 48
},
{
"epoch": 0.258688957325121,
"grad_norm": 1.3084024864816892,
"learning_rate": 8.421052631578948e-06,
"loss": 0.938,
"step": 49
},
{
"epoch": 0.2639683238011439,
"grad_norm": 1.4542464523472705,
"learning_rate": 8.596491228070176e-06,
"loss": 1.009,
"step": 50
},
{
"epoch": 0.2692476902771667,
"grad_norm": 1.3097725384782457,
"learning_rate": 8.771929824561405e-06,
"loss": 0.9557,
"step": 51
},
{
"epoch": 0.2745270567531896,
"grad_norm": 1.3424191425040415,
"learning_rate": 8.947368421052632e-06,
"loss": 0.9773,
"step": 52
},
{
"epoch": 0.2798064232292125,
"grad_norm": 1.3120126450980685,
"learning_rate": 9.12280701754386e-06,
"loss": 0.979,
"step": 53
},
{
"epoch": 0.28508578970523535,
"grad_norm": 1.4326002194117426,
"learning_rate": 9.298245614035088e-06,
"loss": 0.9909,
"step": 54
},
{
"epoch": 0.29036515618125824,
"grad_norm": 1.3411895985410853,
"learning_rate": 9.473684210526315e-06,
"loss": 0.9343,
"step": 55
},
{
"epoch": 0.29564452265728114,
"grad_norm": 1.4092777694950307,
"learning_rate": 9.649122807017545e-06,
"loss": 0.9711,
"step": 56
},
{
"epoch": 0.300923889133304,
"grad_norm": 1.3183008246582104,
"learning_rate": 9.824561403508772e-06,
"loss": 1.0189,
"step": 57
},
{
"epoch": 0.3062032556093269,
"grad_norm": 1.4351454680178788,
"learning_rate": 1e-05,
"loss": 0.9581,
"step": 58
},
{
"epoch": 0.31148262208534977,
"grad_norm": 1.4327154249987661,
"learning_rate": 9.999905136743635e-06,
"loss": 1.0108,
"step": 59
},
{
"epoch": 0.3167619885613726,
"grad_norm": 1.2783140310829089,
"learning_rate": 9.999620550574155e-06,
"loss": 0.8755,
"step": 60
},
{
"epoch": 0.3220413550373955,
"grad_norm": 1.3286747156393754,
"learning_rate": 9.999146252290264e-06,
"loss": 0.9038,
"step": 61
},
{
"epoch": 0.3273207215134184,
"grad_norm": 1.332121876828825,
"learning_rate": 9.99848225988936e-06,
"loss": 0.9326,
"step": 62
},
{
"epoch": 0.33260008798944124,
"grad_norm": 1.279293629368409,
"learning_rate": 9.99762859856683e-06,
"loss": 0.8677,
"step": 63
},
{
"epoch": 0.33787945446546414,
"grad_norm": 1.273872943436948,
"learning_rate": 9.996585300715117e-06,
"loss": 0.9299,
"step": 64
},
{
"epoch": 0.34315882094148703,
"grad_norm": 1.2562758551505886,
"learning_rate": 9.995352405922467e-06,
"loss": 0.9017,
"step": 65
},
{
"epoch": 0.3484381874175099,
"grad_norm": 1.4032802781936289,
"learning_rate": 9.99392996097145e-06,
"loss": 0.9452,
"step": 66
},
{
"epoch": 0.35371755389353277,
"grad_norm": 1.4699498993510196,
"learning_rate": 9.992318019837171e-06,
"loss": 1.0884,
"step": 67
},
{
"epoch": 0.35899692036955566,
"grad_norm": 1.2352131609899204,
"learning_rate": 9.990516643685222e-06,
"loss": 0.9171,
"step": 68
},
{
"epoch": 0.36427628684557856,
"grad_norm": 1.3192425912595864,
"learning_rate": 9.988525900869366e-06,
"loss": 0.9075,
"step": 69
},
{
"epoch": 0.3695556533216014,
"grad_norm": 1.2967022211363584,
"learning_rate": 9.98634586692894e-06,
"loss": 0.885,
"step": 70
},
{
"epoch": 0.3748350197976243,
"grad_norm": 1.3926263384486055,
"learning_rate": 9.983976624585996e-06,
"loss": 0.9215,
"step": 71
},
{
"epoch": 0.3801143862736472,
"grad_norm": 1.2279876199917352,
"learning_rate": 9.981418263742148e-06,
"loss": 0.9121,
"step": 72
},
{
"epoch": 0.38539375274967,
"grad_norm": 1.3296649614901273,
"learning_rate": 9.978670881475173e-06,
"loss": 0.8719,
"step": 73
},
{
"epoch": 0.3906731192256929,
"grad_norm": 1.4366980284008448,
"learning_rate": 9.975734582035323e-06,
"loss": 0.9158,
"step": 74
},
{
"epoch": 0.3959524857017158,
"grad_norm": 1.4161336324128757,
"learning_rate": 9.972609476841368e-06,
"loss": 0.9519,
"step": 75
},
{
"epoch": 0.40123185217773866,
"grad_norm": 1.2765268199027533,
"learning_rate": 9.96929568447637e-06,
"loss": 0.8892,
"step": 76
},
{
"epoch": 0.40651121865376155,
"grad_norm": 1.2903936251174384,
"learning_rate": 9.965793330683182e-06,
"loss": 0.8749,
"step": 77
},
{
"epoch": 0.41179058512978445,
"grad_norm": 1.328926357616079,
"learning_rate": 9.96210254835968e-06,
"loss": 0.8218,
"step": 78
},
{
"epoch": 0.4170699516058073,
"grad_norm": 1.192377478319799,
"learning_rate": 9.958223477553715e-06,
"loss": 0.8215,
"step": 79
},
{
"epoch": 0.4223493180818302,
"grad_norm": 1.2953681293753618,
"learning_rate": 9.954156265457801e-06,
"loss": 0.8103,
"step": 80
},
{
"epoch": 0.4276286845578531,
"grad_norm": 1.3072992620223463,
"learning_rate": 9.949901066403536e-06,
"loss": 0.8992,
"step": 81
},
{
"epoch": 0.4329080510338759,
"grad_norm": 1.2374067876062618,
"learning_rate": 9.945458041855732e-06,
"loss": 0.8831,
"step": 82
},
{
"epoch": 0.4381874175098988,
"grad_norm": 1.3974745936753212,
"learning_rate": 9.940827360406297e-06,
"loss": 0.8843,
"step": 83
},
{
"epoch": 0.4434667839859217,
"grad_norm": 1.2232655465327642,
"learning_rate": 9.936009197767847e-06,
"loss": 0.8714,
"step": 84
},
{
"epoch": 0.44874615046194455,
"grad_norm": 1.3329454900801005,
"learning_rate": 9.931003736767013e-06,
"loss": 0.9267,
"step": 85
},
{
"epoch": 0.45402551693796744,
"grad_norm": 1.2551849569644793,
"learning_rate": 9.925811167337533e-06,
"loss": 0.8765,
"step": 86
},
{
"epoch": 0.45930488341399034,
"grad_norm": 1.324099190921721,
"learning_rate": 9.920431686513023e-06,
"loss": 0.8835,
"step": 87
},
{
"epoch": 0.4645842498900132,
"grad_norm": 1.2343457957658,
"learning_rate": 9.91486549841951e-06,
"loss": 0.8557,
"step": 88
},
{
"epoch": 0.4698636163660361,
"grad_norm": 1.357007540882203,
"learning_rate": 9.909112814267686e-06,
"loss": 0.93,
"step": 89
},
{
"epoch": 0.47514298284205897,
"grad_norm": 1.2685007078756392,
"learning_rate": 9.903173852344889e-06,
"loss": 0.8493,
"step": 90
},
{
"epoch": 0.4804223493180818,
"grad_norm": 1.3198835038669643,
"learning_rate": 9.89704883800683e-06,
"loss": 0.9375,
"step": 91
},
{
"epoch": 0.4857017157941047,
"grad_norm": 1.2921827299558808,
"learning_rate": 9.890738003669029e-06,
"loss": 0.8502,
"step": 92
},
{
"epoch": 0.4909810822701276,
"grad_norm": 1.3792411082761877,
"learning_rate": 9.884241588798004e-06,
"loss": 0.8722,
"step": 93
},
{
"epoch": 0.49626044874615044,
"grad_norm": 1.3853454707822626,
"learning_rate": 9.877559839902185e-06,
"loss": 0.9781,
"step": 94
},
{
"epoch": 0.5015398152221734,
"grad_norm": 1.4954228020473375,
"learning_rate": 9.870693010522552e-06,
"loss": 0.9494,
"step": 95
},
{
"epoch": 0.5068191816981962,
"grad_norm": 1.4537142950730755,
"learning_rate": 9.863641361223025e-06,
"loss": 0.8948,
"step": 96
},
{
"epoch": 0.5120985481742191,
"grad_norm": 1.3353911864277976,
"learning_rate": 9.85640515958057e-06,
"loss": 0.9305,
"step": 97
},
{
"epoch": 0.517377914650242,
"grad_norm": 1.3069189595420478,
"learning_rate": 9.848984680175049e-06,
"loss": 1.0241,
"step": 98
},
{
"epoch": 0.5226572811262649,
"grad_norm": 1.4160480924828072,
"learning_rate": 9.841380204578795e-06,
"loss": 0.8869,
"step": 99
},
{
"epoch": 0.5279366476022878,
"grad_norm": 1.6379023507410095,
"learning_rate": 9.833592021345938e-06,
"loss": 0.8854,
"step": 100
},
{
"epoch": 0.5332160140783107,
"grad_norm": 1.4984036893204795,
"learning_rate": 9.825620426001446e-06,
"loss": 0.9243,
"step": 101
},
{
"epoch": 0.5384953805543334,
"grad_norm": 1.3234999163374,
"learning_rate": 9.817465721029916e-06,
"loss": 0.8645,
"step": 102
},
{
"epoch": 0.5437747470303563,
"grad_norm": 1.3367013313050653,
"learning_rate": 9.809128215864096e-06,
"loss": 0.9064,
"step": 103
},
{
"epoch": 0.5490541135063792,
"grad_norm": 1.2888959675698541,
"learning_rate": 9.800608226873143e-06,
"loss": 0.8828,
"step": 104
},
{
"epoch": 0.5543334799824021,
"grad_norm": 1.3417129361301388,
"learning_rate": 9.791906077350613e-06,
"loss": 0.8687,
"step": 105
},
{
"epoch": 0.559612846458425,
"grad_norm": 1.387011742779051,
"learning_rate": 9.783022097502204e-06,
"loss": 1.0081,
"step": 106
},
{
"epoch": 0.5648922129344479,
"grad_norm": 1.2844930485660748,
"learning_rate": 9.773956624433224e-06,
"loss": 0.8988,
"step": 107
},
{
"epoch": 0.5701715794104707,
"grad_norm": 1.362695293650949,
"learning_rate": 9.764710002135784e-06,
"loss": 0.8097,
"step": 108
},
{
"epoch": 0.5754509458864936,
"grad_norm": 1.311448593659273,
"learning_rate": 9.755282581475769e-06,
"loss": 0.8707,
"step": 109
},
{
"epoch": 0.5807303123625165,
"grad_norm": 1.4708655509314155,
"learning_rate": 9.745674720179507e-06,
"loss": 0.913,
"step": 110
},
{
"epoch": 0.5860096788385394,
"grad_norm": 1.3142546545330323,
"learning_rate": 9.735886782820202e-06,
"loss": 0.8495,
"step": 111
},
{
"epoch": 0.5912890453145623,
"grad_norm": 1.2448969618431651,
"learning_rate": 9.7259191408041e-06,
"loss": 0.7694,
"step": 112
},
{
"epoch": 0.5965684117905852,
"grad_norm": 1.2817154139311548,
"learning_rate": 9.715772172356388e-06,
"loss": 0.8874,
"step": 113
},
{
"epoch": 0.601847778266608,
"grad_norm": 1.303527565876824,
"learning_rate": 9.705446262506858e-06,
"loss": 0.9887,
"step": 114
},
{
"epoch": 0.6071271447426309,
"grad_norm": 1.3162139512016877,
"learning_rate": 9.694941803075285e-06,
"loss": 0.8731,
"step": 115
},
{
"epoch": 0.6124065112186537,
"grad_norm": 1.3631926446929201,
"learning_rate": 9.684259192656554e-06,
"loss": 0.8773,
"step": 116
},
{
"epoch": 0.6176858776946766,
"grad_norm": 1.354454946440946,
"learning_rate": 9.673398836605554e-06,
"loss": 0.9415,
"step": 117
},
{
"epoch": 0.6229652441706995,
"grad_norm": 1.2806247492822374,
"learning_rate": 9.66236114702178e-06,
"loss": 0.8214,
"step": 118
},
{
"epoch": 0.6282446106467224,
"grad_norm": 1.3067756329426423,
"learning_rate": 9.651146542733702e-06,
"loss": 0.9561,
"step": 119
},
{
"epoch": 0.6335239771227452,
"grad_norm": 1.3982461215765278,
"learning_rate": 9.639755449282874e-06,
"loss": 0.8812,
"step": 120
},
{
"epoch": 0.6388033435987681,
"grad_norm": 1.2734112863554599,
"learning_rate": 9.628188298907782e-06,
"loss": 0.803,
"step": 121
},
{
"epoch": 0.644082710074791,
"grad_norm": 1.4042153797470949,
"learning_rate": 9.616445530527448e-06,
"loss": 0.8159,
"step": 122
},
{
"epoch": 0.6493620765508139,
"grad_norm": 1.230725944316371,
"learning_rate": 9.60452758972477e-06,
"loss": 0.8846,
"step": 123
},
{
"epoch": 0.6546414430268368,
"grad_norm": 1.242349305278167,
"learning_rate": 9.592434928729617e-06,
"loss": 0.7621,
"step": 124
},
{
"epoch": 0.6599208095028597,
"grad_norm": 1.4468948851039833,
"learning_rate": 9.58016800640167e-06,
"loss": 0.9327,
"step": 125
},
{
"epoch": 0.6652001759788825,
"grad_norm": 1.3595406863398718,
"learning_rate": 9.567727288213005e-06,
"loss": 0.8629,
"step": 126
},
{
"epoch": 0.6704795424549054,
"grad_norm": 1.3391481209651412,
"learning_rate": 9.555113246230443e-06,
"loss": 0.943,
"step": 127
},
{
"epoch": 0.6757589089309283,
"grad_norm": 1.3816914835623122,
"learning_rate": 9.542326359097619e-06,
"loss": 0.8841,
"step": 128
},
{
"epoch": 0.6810382754069512,
"grad_norm": 1.2580871478999183,
"learning_rate": 9.529367112016836e-06,
"loss": 0.8039,
"step": 129
},
{
"epoch": 0.6863176418829741,
"grad_norm": 1.3901966106075496,
"learning_rate": 9.516235996730645e-06,
"loss": 0.8409,
"step": 130
},
{
"epoch": 0.691597008358997,
"grad_norm": 1.2698547636051096,
"learning_rate": 9.502933511503187e-06,
"loss": 0.8499,
"step": 131
},
{
"epoch": 0.6968763748350199,
"grad_norm": 1.390987513649665,
"learning_rate": 9.489460161101291e-06,
"loss": 0.8549,
"step": 132
},
{
"epoch": 0.7021557413110426,
"grad_norm": 1.3670503603402828,
"learning_rate": 9.475816456775313e-06,
"loss": 0.8436,
"step": 133
},
{
"epoch": 0.7074351077870655,
"grad_norm": 1.2624531025923733,
"learning_rate": 9.46200291623974e-06,
"loss": 0.853,
"step": 134
},
{
"epoch": 0.7127144742630884,
"grad_norm": 1.2250644357723437,
"learning_rate": 9.44802006365355e-06,
"loss": 0.8283,
"step": 135
},
{
"epoch": 0.7179938407391113,
"grad_norm": 1.2734440256518054,
"learning_rate": 9.43386842960031e-06,
"loss": 0.8474,
"step": 136
},
{
"epoch": 0.7232732072151342,
"grad_norm": 1.3509174047115824,
"learning_rate": 9.419548551068061e-06,
"loss": 0.8657,
"step": 137
},
{
"epoch": 0.7285525736911571,
"grad_norm": 1.370913524291918,
"learning_rate": 9.405060971428924e-06,
"loss": 0.8915,
"step": 138
},
{
"epoch": 0.7338319401671799,
"grad_norm": 1.2433881056858749,
"learning_rate": 9.39040624041849e-06,
"loss": 1.024,
"step": 139
},
{
"epoch": 0.7391113066432028,
"grad_norm": 1.1648887790445894,
"learning_rate": 9.375584914114963e-06,
"loss": 0.7808,
"step": 140
},
{
"epoch": 0.7443906731192257,
"grad_norm": 1.3435997997706464,
"learning_rate": 9.360597554918055e-06,
"loss": 0.9497,
"step": 141
},
{
"epoch": 0.7496700395952486,
"grad_norm": 1.401286458253954,
"learning_rate": 9.345444731527642e-06,
"loss": 0.927,
"step": 142
},
{
"epoch": 0.7549494060712715,
"grad_norm": 1.2883022833187427,
"learning_rate": 9.330127018922195e-06,
"loss": 0.9708,
"step": 143
},
{
"epoch": 0.7602287725472944,
"grad_norm": 1.2683832198503506,
"learning_rate": 9.31464499833695e-06,
"loss": 0.8359,
"step": 144
},
{
"epoch": 0.7655081390233172,
"grad_norm": 1.2280565258082867,
"learning_rate": 9.298999257241862e-06,
"loss": 0.8085,
"step": 145
},
{
"epoch": 0.77078750549934,
"grad_norm": 1.3268019274693263,
"learning_rate": 9.283190389319315e-06,
"loss": 0.8619,
"step": 146
},
{
"epoch": 0.776066871975363,
"grad_norm": 1.3405013825655892,
"learning_rate": 9.26721899444158e-06,
"loss": 0.8757,
"step": 147
},
{
"epoch": 0.7813462384513858,
"grad_norm": 1.3113438616437634,
"learning_rate": 9.251085678648072e-06,
"loss": 0.8586,
"step": 148
},
{
"epoch": 0.7866256049274087,
"grad_norm": 1.2692947439879936,
"learning_rate": 9.234791054122336e-06,
"loss": 0.8315,
"step": 149
},
{
"epoch": 0.7919049714034316,
"grad_norm": 1.288784803692807,
"learning_rate": 9.218335739168833e-06,
"loss": 0.8138,
"step": 150
},
{
"epoch": 0.7971843378794544,
"grad_norm": 1.2853502663214944,
"learning_rate": 9.201720358189464e-06,
"loss": 0.8953,
"step": 151
},
{
"epoch": 0.8024637043554773,
"grad_norm": 1.3844299853873043,
"learning_rate": 9.18494554165989e-06,
"loss": 0.8527,
"step": 152
},
{
"epoch": 0.8077430708315002,
"grad_norm": 1.2348194951175424,
"learning_rate": 9.168011926105598e-06,
"loss": 0.7873,
"step": 153
},
{
"epoch": 0.8130224373075231,
"grad_norm": 1.4444636219923,
"learning_rate": 9.150920154077753e-06,
"loss": 0.9273,
"step": 154
},
{
"epoch": 0.818301803783546,
"grad_norm": 1.3415281982191147,
"learning_rate": 9.133670874128818e-06,
"loss": 0.8165,
"step": 155
},
{
"epoch": 0.8235811702595689,
"grad_norm": 1.4171257480887072,
"learning_rate": 9.116264740787937e-06,
"loss": 0.903,
"step": 156
},
{
"epoch": 0.8288605367355917,
"grad_norm": 1.4009278155261338,
"learning_rate": 9.098702414536107e-06,
"loss": 0.8654,
"step": 157
},
{
"epoch": 0.8341399032116146,
"grad_norm": 1.3353961972609718,
"learning_rate": 9.08098456178111e-06,
"loss": 0.8821,
"step": 158
},
{
"epoch": 0.8394192696876375,
"grad_norm": 1.465378569130035,
"learning_rate": 9.06311185483223e-06,
"loss": 0.8585,
"step": 159
},
{
"epoch": 0.8446986361636604,
"grad_norm": 1.3247392983018136,
"learning_rate": 9.045084971874738e-06,
"loss": 0.8409,
"step": 160
},
{
"epoch": 0.8499780026396833,
"grad_norm": 1.3094980178064088,
"learning_rate": 9.026904596944163e-06,
"loss": 0.8423,
"step": 161
},
{
"epoch": 0.8552573691157062,
"grad_norm": 1.3167256777304588,
"learning_rate": 9.008571419900334e-06,
"loss": 0.8123,
"step": 162
},
{
"epoch": 0.8605367355917289,
"grad_norm": 1.3199382134511854,
"learning_rate": 8.990086136401199e-06,
"loss": 0.8171,
"step": 163
},
{
"epoch": 0.8658161020677518,
"grad_norm": 1.2759439255391014,
"learning_rate": 8.97144944787643e-06,
"loss": 0.7351,
"step": 164
},
{
"epoch": 0.8710954685437747,
"grad_norm": 1.3371401649586945,
"learning_rate": 8.952662061500817e-06,
"loss": 0.9156,
"step": 165
},
{
"epoch": 0.8763748350197976,
"grad_norm": 1.2566756220149857,
"learning_rate": 8.933724690167417e-06,
"loss": 0.9278,
"step": 166
},
{
"epoch": 0.8816542014958205,
"grad_norm": 1.2918012041625928,
"learning_rate": 8.914638052460515e-06,
"loss": 0.796,
"step": 167
},
{
"epoch": 0.8869335679718434,
"grad_norm": 1.3420464341954295,
"learning_rate": 8.895402872628352e-06,
"loss": 0.8289,
"step": 168
},
{
"epoch": 0.8922129344478662,
"grad_norm": 1.386536525968897,
"learning_rate": 8.87601988055565e-06,
"loss": 0.9055,
"step": 169
},
{
"epoch": 0.8974923009238891,
"grad_norm": 1.2764097483742913,
"learning_rate": 8.856489811735904e-06,
"loss": 0.8221,
"step": 170
},
{
"epoch": 0.902771667399912,
"grad_norm": 1.3242925889713713,
"learning_rate": 8.836813407243485e-06,
"loss": 0.8052,
"step": 171
},
{
"epoch": 0.9080510338759349,
"grad_norm": 1.135434673976009,
"learning_rate": 8.816991413705515e-06,
"loss": 0.8048,
"step": 172
},
{
"epoch": 0.9133304003519578,
"grad_norm": 1.3633873651508777,
"learning_rate": 8.797024583273536e-06,
"loss": 0.8403,
"step": 173
},
{
"epoch": 0.9186097668279807,
"grad_norm": 1.4254861868193114,
"learning_rate": 8.776913673594968e-06,
"loss": 0.8558,
"step": 174
},
{
"epoch": 0.9238891333040036,
"grad_norm": 1.3944884567839855,
"learning_rate": 8.756659447784367e-06,
"loss": 0.8265,
"step": 175
},
{
"epoch": 0.9291684997800264,
"grad_norm": 1.2473776372568752,
"learning_rate": 8.736262674394455e-06,
"loss": 0.8558,
"step": 176
},
{
"epoch": 0.9344478662560493,
"grad_norm": 1.1199152507629353,
"learning_rate": 8.715724127386971e-06,
"loss": 0.7684,
"step": 177
},
{
"epoch": 0.9397272327320721,
"grad_norm": 1.4769772323796146,
"learning_rate": 8.695044586103297e-06,
"loss": 0.8404,
"step": 178
},
{
"epoch": 0.945006599208095,
"grad_norm": 1.2812768021421608,
"learning_rate": 8.674224835234879e-06,
"loss": 0.855,
"step": 179
},
{
"epoch": 0.9502859656841179,
"grad_norm": 1.4074704240057607,
"learning_rate": 8.653265664793466e-06,
"loss": 0.8966,
"step": 180
},
{
"epoch": 0.9555653321601408,
"grad_norm": 1.3552977566183917,
"learning_rate": 8.632167870081122e-06,
"loss": 0.8983,
"step": 181
},
{
"epoch": 0.9608446986361636,
"grad_norm": 1.2662415913666043,
"learning_rate": 8.610932251660046e-06,
"loss": 0.7676,
"step": 182
},
{
"epoch": 0.9661240651121865,
"grad_norm": 1.3122048439005143,
"learning_rate": 8.58955961532221e-06,
"loss": 0.8486,
"step": 183
},
{
"epoch": 0.9714034315882094,
"grad_norm": 1.2880133358543706,
"learning_rate": 8.568050772058763e-06,
"loss": 1.0695,
"step": 184
},
{
"epoch": 0.9766827980642323,
"grad_norm": 1.2876646312084281,
"learning_rate": 8.546406538029268e-06,
"loss": 0.8744,
"step": 185
},
{
"epoch": 0.9819621645402552,
"grad_norm": 1.3907216667545839,
"learning_rate": 8.524627734530738e-06,
"loss": 0.8009,
"step": 186
},
{
"epoch": 0.9872415310162781,
"grad_norm": 1.298714668518304,
"learning_rate": 8.502715187966455e-06,
"loss": 0.8211,
"step": 187
},
{
"epoch": 0.9925208974923009,
"grad_norm": 1.4112133312678243,
"learning_rate": 8.480669729814635e-06,
"loss": 0.8909,
"step": 188
},
{
"epoch": 0.9978002639683238,
"grad_norm": 1.2429484880228319,
"learning_rate": 8.458492196596852e-06,
"loss": 0.7842,
"step": 189
},
{
"epoch": 1.0,
"grad_norm": 1.2429484880228319,
"learning_rate": 8.436183429846314e-06,
"loss": 0.8917,
"step": 190
},
{
"epoch": 1.005279366476023,
"grad_norm": 2.3017818904176828,
"learning_rate": 8.413744276075928e-06,
"loss": 0.7453,
"step": 191
},
{
"epoch": 1.0105587329520458,
"grad_norm": 1.223195818545867,
"learning_rate": 8.39117558674617e-06,
"loss": 0.6252,
"step": 192
},
{
"epoch": 1.0158380994280687,
"grad_norm": 1.2015392058187855,
"learning_rate": 8.368478218232787e-06,
"loss": 0.6357,
"step": 193
},
{
"epoch": 1.0211174659040916,
"grad_norm": 1.2194373310662718,
"learning_rate": 8.345653031794292e-06,
"loss": 0.6568,
"step": 194
},
{
"epoch": 1.0263968323801145,
"grad_norm": 1.251074042866813,
"learning_rate": 8.32270089353929e-06,
"loss": 0.6674,
"step": 195
},
{
"epoch": 1.0316761988561374,
"grad_norm": 1.3553593206962593,
"learning_rate": 8.299622674393615e-06,
"loss": 0.7704,
"step": 196
},
{
"epoch": 1.03695556533216,
"grad_norm": 1.3610141965233205,
"learning_rate": 8.27641925006727e-06,
"loss": 0.6893,
"step": 197
},
{
"epoch": 1.042234931808183,
"grad_norm": 1.4430956370832788,
"learning_rate": 8.25309150102121e-06,
"loss": 0.6783,
"step": 198
},
{
"epoch": 1.0475142982842058,
"grad_norm": 1.3237087640173875,
"learning_rate": 8.229640312433938e-06,
"loss": 0.6328,
"step": 199
},
{
"epoch": 1.0527936647602287,
"grad_norm": 1.4354224675777918,
"learning_rate": 8.206066574167893e-06,
"loss": 0.7054,
"step": 200
},
{
"epoch": 1.0580730312362516,
"grad_norm": 1.3820711783982724,
"learning_rate": 8.182371180735708e-06,
"loss": 0.6596,
"step": 201
},
{
"epoch": 1.0633523977122745,
"grad_norm": 1.5892680249474918,
"learning_rate": 8.158555031266255e-06,
"loss": 0.7119,
"step": 202
},
{
"epoch": 1.0686317641882974,
"grad_norm": 1.3831340041775368,
"learning_rate": 8.134619029470535e-06,
"loss": 0.6956,
"step": 203
},
{
"epoch": 1.0739111306643203,
"grad_norm": 1.4618391544645484,
"learning_rate": 8.110564083607371e-06,
"loss": 0.6927,
"step": 204
},
{
"epoch": 1.0791904971403432,
"grad_norm": 1.399225289114619,
"learning_rate": 8.086391106448965e-06,
"loss": 0.6719,
"step": 205
},
{
"epoch": 1.084469863616366,
"grad_norm": 1.39102454168437,
"learning_rate": 8.06210101524625e-06,
"loss": 0.6677,
"step": 206
},
{
"epoch": 1.089749230092389,
"grad_norm": 1.4013045019864605,
"learning_rate": 8.037694731694085e-06,
"loss": 0.6807,
"step": 207
},
{
"epoch": 1.095028596568412,
"grad_norm": 1.3292251495775314,
"learning_rate": 8.013173181896283e-06,
"loss": 0.685,
"step": 208
},
{
"epoch": 1.1003079630444346,
"grad_norm": 1.468780970931853,
"learning_rate": 7.988537296330468e-06,
"loss": 0.6559,
"step": 209
},
{
"epoch": 1.1055873295204575,
"grad_norm": 1.2560486698645472,
"learning_rate": 7.963788009812775e-06,
"loss": 0.5966,
"step": 210
},
{
"epoch": 1.1108666959964804,
"grad_norm": 1.3263436068368955,
"learning_rate": 7.938926261462366e-06,
"loss": 0.6426,
"step": 211
},
{
"epoch": 1.1161460624725033,
"grad_norm": 1.4262360139572436,
"learning_rate": 7.913952994665805e-06,
"loss": 0.7044,
"step": 212
},
{
"epoch": 1.1214254289485261,
"grad_norm": 1.413724127688339,
"learning_rate": 7.888869157041257e-06,
"loss": 0.6892,
"step": 213
},
{
"epoch": 1.126704795424549,
"grad_norm": 1.3691991114364659,
"learning_rate": 7.863675700402527e-06,
"loss": 0.6913,
"step": 214
},
{
"epoch": 1.131984161900572,
"grad_norm": 1.5050181189304115,
"learning_rate": 7.838373580722952e-06,
"loss": 0.7563,
"step": 215
},
{
"epoch": 1.1372635283765948,
"grad_norm": 1.280216474266895,
"learning_rate": 7.812963758099118e-06,
"loss": 0.6034,
"step": 216
},
{
"epoch": 1.1425428948526177,
"grad_norm": 1.3442258228040502,
"learning_rate": 7.787447196714428e-06,
"loss": 0.703,
"step": 217
},
{
"epoch": 1.1478222613286406,
"grad_norm": 1.374879774404637,
"learning_rate": 7.76182486480253e-06,
"loss": 0.6622,
"step": 218
},
{
"epoch": 1.1531016278046635,
"grad_norm": 1.1079594025474235,
"learning_rate": 7.736097734610557e-06,
"loss": 0.6343,
"step": 219
},
{
"epoch": 1.1583809942806864,
"grad_norm": 1.4099196984187832,
"learning_rate": 7.710266782362248e-06,
"loss": 0.7379,
"step": 220
},
{
"epoch": 1.163660360756709,
"grad_norm": 1.3722807994126047,
"learning_rate": 7.684332988220901e-06,
"loss": 0.7447,
"step": 221
},
{
"epoch": 1.168939727232732,
"grad_norm": 1.495776876658676,
"learning_rate": 7.658297336252181e-06,
"loss": 0.6477,
"step": 222
},
{
"epoch": 1.1742190937087549,
"grad_norm": 1.3604596279976626,
"learning_rate": 7.63216081438678e-06,
"loss": 0.7295,
"step": 223
},
{
"epoch": 1.1794984601847778,
"grad_norm": 1.3235758656247603,
"learning_rate": 7.605924414382926e-06,
"loss": 0.6585,
"step": 224
},
{
"epoch": 1.1847778266608007,
"grad_norm": 1.4440449502234758,
"learning_rate": 7.579589131788756e-06,
"loss": 0.6244,
"step": 225
},
{
"epoch": 1.1900571931368236,
"grad_norm": 1.3928149968149692,
"learning_rate": 7.553155965904535e-06,
"loss": 0.637,
"step": 226
},
{
"epoch": 1.1953365596128465,
"grad_norm": 1.4032725114348137,
"learning_rate": 7.526625919744741e-06,
"loss": 0.6644,
"step": 227
},
{
"epoch": 1.2006159260888694,
"grad_norm": 1.3266237278115651,
"learning_rate": 7.500000000000001e-06,
"loss": 0.6354,
"step": 228
},
{
"epoch": 1.2058952925648923,
"grad_norm": 1.5117455894482101,
"learning_rate": 7.473279216998896e-06,
"loss": 0.634,
"step": 229
},
{
"epoch": 1.2111746590409151,
"grad_norm": 1.5645913721329012,
"learning_rate": 7.4464645846696186e-06,
"loss": 0.8021,
"step": 230
},
{
"epoch": 1.216454025516938,
"grad_norm": 1.595585311092696,
"learning_rate": 7.419557120501508e-06,
"loss": 0.6831,
"step": 231
},
{
"epoch": 1.221733391992961,
"grad_norm": 1.3739554991120078,
"learning_rate": 7.392557845506433e-06,
"loss": 0.6571,
"step": 232
},
{
"epoch": 1.2270127584689838,
"grad_norm": 1.383758484574002,
"learning_rate": 7.365467784180051e-06,
"loss": 0.6015,
"step": 233
},
{
"epoch": 1.2322921249450065,
"grad_norm": 1.2935886046335063,
"learning_rate": 7.3382879644629345e-06,
"loss": 0.684,
"step": 234
},
{
"epoch": 1.2375714914210294,
"grad_norm": 1.4930967440370626,
"learning_rate": 7.311019417701567e-06,
"loss": 0.618,
"step": 235
},
{
"epoch": 1.2428508578970523,
"grad_norm": 1.4340994519601895,
"learning_rate": 7.283663178609204e-06,
"loss": 0.6676,
"step": 236
},
{
"epoch": 1.2481302243730752,
"grad_norm": 1.332079262932709,
"learning_rate": 7.256220285226615e-06,
"loss": 0.6518,
"step": 237
},
{
"epoch": 1.253409590849098,
"grad_norm": 1.4124012184704442,
"learning_rate": 7.2286917788826926e-06,
"loss": 0.7255,
"step": 238
},
{
"epoch": 1.258688957325121,
"grad_norm": 1.435352374027868,
"learning_rate": 7.201078704154938e-06,
"loss": 0.6427,
"step": 239
},
{
"epoch": 1.2639683238011439,
"grad_norm": 1.3129409573708608,
"learning_rate": 7.173382108829826e-06,
"loss": 0.6435,
"step": 240
},
{
"epoch": 1.2692476902771668,
"grad_norm": 1.3029674291538322,
"learning_rate": 7.145603043863045e-06,
"loss": 0.6018,
"step": 241
},
{
"epoch": 1.2745270567531897,
"grad_norm": 1.4384137050989114,
"learning_rate": 7.117742563339622e-06,
"loss": 0.6399,
"step": 242
},
{
"epoch": 1.2798064232292126,
"grad_norm": 1.404893956226061,
"learning_rate": 7.089801724433918e-06,
"loss": 0.6591,
"step": 243
},
{
"epoch": 1.2850857897052355,
"grad_norm": 1.3631545091800101,
"learning_rate": 7.061781587369518e-06,
"loss": 0.661,
"step": 244
},
{
"epoch": 1.2903651561812581,
"grad_norm": 1.3722052435590018,
"learning_rate": 7.033683215379002e-06,
"loss": 0.7329,
"step": 245
},
{
"epoch": 1.2956445226572813,
"grad_norm": 1.3373371627523003,
"learning_rate": 7.005507674663594e-06,
"loss": 0.6853,
"step": 246
},
{
"epoch": 1.300923889133304,
"grad_norm": 1.347515123739391,
"learning_rate": 6.977256034352713e-06,
"loss": 0.6356,
"step": 247
},
{
"epoch": 1.3062032556093268,
"grad_norm": 1.3251248469180115,
"learning_rate": 6.948929366463397e-06,
"loss": 0.6542,
"step": 248
},
{
"epoch": 1.3114826220853497,
"grad_norm": 1.413403738053324,
"learning_rate": 6.9205287458596305e-06,
"loss": 0.6732,
"step": 249
},
{
"epoch": 1.3167619885613726,
"grad_norm": 1.3195739239798052,
"learning_rate": 6.892055250211552e-06,
"loss": 0.6157,
"step": 250
},
{
"epoch": 1.3220413550373955,
"grad_norm": 1.398192094348221,
"learning_rate": 6.86350995995457e-06,
"loss": 0.6903,
"step": 251
},
{
"epoch": 1.3273207215134184,
"grad_norm": 1.4424925485085278,
"learning_rate": 6.834893958248361e-06,
"loss": 0.6967,
"step": 252
},
{
"epoch": 1.3326000879894413,
"grad_norm": 1.3715769454036013,
"learning_rate": 6.806208330935766e-06,
"loss": 0.6402,
"step": 253
},
{
"epoch": 1.3378794544654642,
"grad_norm": 1.3832262511831421,
"learning_rate": 6.77745416650159e-06,
"loss": 0.6684,
"step": 254
},
{
"epoch": 1.343158820941487,
"grad_norm": 1.3503209557607232,
"learning_rate": 6.748632556031306e-06,
"loss": 0.7828,
"step": 255
},
{
"epoch": 1.34843818741751,
"grad_norm": 1.3619508308924722,
"learning_rate": 6.719744593169642e-06,
"loss": 0.6583,
"step": 256
},
{
"epoch": 1.3537175538935329,
"grad_norm": 1.543700428502048,
"learning_rate": 6.690791374079086e-06,
"loss": 0.6687,
"step": 257
},
{
"epoch": 1.3589969203695555,
"grad_norm": 1.3454959558325137,
"learning_rate": 6.6617739973982985e-06,
"loss": 0.6109,
"step": 258
},
{
"epoch": 1.3642762868455787,
"grad_norm": 1.3091432151076758,
"learning_rate": 6.6326935642004165e-06,
"loss": 0.6819,
"step": 259
},
{
"epoch": 1.3695556533216013,
"grad_norm": 1.4211398893275302,
"learning_rate": 6.6035511779512764e-06,
"loss": 0.6106,
"step": 260
},
{
"epoch": 1.3748350197976242,
"grad_norm": 1.3056584370485818,
"learning_rate": 6.57434794446754e-06,
"loss": 0.6348,
"step": 261
},
{
"epoch": 1.3801143862736471,
"grad_norm": 1.432042689389407,
"learning_rate": 6.545084971874738e-06,
"loss": 0.6428,
"step": 262
},
{
"epoch": 1.38539375274967,
"grad_norm": 1.3426884980712488,
"learning_rate": 6.515763370565218e-06,
"loss": 0.6076,
"step": 263
},
{
"epoch": 1.390673119225693,
"grad_norm": 1.2615012752998496,
"learning_rate": 6.486384253156014e-06,
"loss": 0.7665,
"step": 264
},
{
"epoch": 1.3959524857017158,
"grad_norm": 1.3453647581013601,
"learning_rate": 6.456948734446624e-06,
"loss": 0.6377,
"step": 265
},
{
"epoch": 1.4012318521777387,
"grad_norm": 1.4161701822318469,
"learning_rate": 6.427457931376712e-06,
"loss": 0.6732,
"step": 266
},
{
"epoch": 1.4065112186537616,
"grad_norm": 1.32784380027798,
"learning_rate": 6.39791296298372e-06,
"loss": 0.628,
"step": 267
},
{
"epoch": 1.4117905851297845,
"grad_norm": 1.3941543313635256,
"learning_rate": 6.368314950360416e-06,
"loss": 0.6554,
"step": 268
},
{
"epoch": 1.4170699516058072,
"grad_norm": 1.4362476191327336,
"learning_rate": 6.3386650166123406e-06,
"loss": 0.7686,
"step": 269
},
{
"epoch": 1.4223493180818303,
"grad_norm": 1.4890178566624934,
"learning_rate": 6.308964286815203e-06,
"loss": 0.6515,
"step": 270
},
{
"epoch": 1.427628684557853,
"grad_norm": 1.52171364834682,
"learning_rate": 6.279213887972179e-06,
"loss": 0.6851,
"step": 271
},
{
"epoch": 1.4329080510338759,
"grad_norm": 1.170459513215867,
"learning_rate": 6.249414948971154e-06,
"loss": 0.634,
"step": 272
},
{
"epoch": 1.4381874175098988,
"grad_norm": 1.2892794847690134,
"learning_rate": 6.219568600541886e-06,
"loss": 0.5732,
"step": 273
},
{
"epoch": 1.4434667839859217,
"grad_norm": 1.4147522223651536,
"learning_rate": 6.189675975213094e-06,
"loss": 0.6505,
"step": 274
},
{
"epoch": 1.4487461504619445,
"grad_norm": 1.2766098284530847,
"learning_rate": 6.159738207269491e-06,
"loss": 0.613,
"step": 275
},
{
"epoch": 1.4540255169379674,
"grad_norm": 1.3170993095662313,
"learning_rate": 6.129756432708739e-06,
"loss": 0.6058,
"step": 276
},
{
"epoch": 1.4593048834139903,
"grad_norm": 1.3490297596989358,
"learning_rate": 6.099731789198344e-06,
"loss": 0.7526,
"step": 277
},
{
"epoch": 1.4645842498900132,
"grad_norm": 1.49015141935795,
"learning_rate": 6.0696654160324875e-06,
"loss": 0.6664,
"step": 278
},
{
"epoch": 1.4698636163660361,
"grad_norm": 1.353677527773509,
"learning_rate": 6.039558454088796e-06,
"loss": 0.6508,
"step": 279
},
{
"epoch": 1.475142982842059,
"grad_norm": 1.3542791249145698,
"learning_rate": 6.009412045785051e-06,
"loss": 0.6868,
"step": 280
},
{
"epoch": 1.480422349318082,
"grad_norm": 1.239412403087578,
"learning_rate": 5.9792273350358354e-06,
"loss": 0.6542,
"step": 281
},
{
"epoch": 1.4857017157941046,
"grad_norm": 1.3657653754563595,
"learning_rate": 5.9490054672091305e-06,
"loss": 0.695,
"step": 282
},
{
"epoch": 1.4909810822701277,
"grad_norm": 1.4092314752807444,
"learning_rate": 5.918747589082853e-06,
"loss": 0.6472,
"step": 283
},
{
"epoch": 1.4962604487461504,
"grad_norm": 1.518575708392721,
"learning_rate": 5.888454848801345e-06,
"loss": 0.6623,
"step": 284
},
{
"epoch": 1.5015398152221735,
"grad_norm": 1.4295896368916283,
"learning_rate": 5.8581283958317995e-06,
"loss": 0.7579,
"step": 285
},
{
"epoch": 1.5068191816981962,
"grad_norm": 1.479897530210997,
"learning_rate": 5.82776938092065e-06,
"loss": 0.7334,
"step": 286
},
{
"epoch": 1.512098548174219,
"grad_norm": 1.4366013380091691,
"learning_rate": 5.797378956049905e-06,
"loss": 0.6739,
"step": 287
},
{
"epoch": 1.517377914650242,
"grad_norm": 1.4716566746195219,
"learning_rate": 5.766958274393428e-06,
"loss": 0.7233,
"step": 288
},
{
"epoch": 1.5226572811262649,
"grad_norm": 1.3374013752311613,
"learning_rate": 5.736508490273189e-06,
"loss": 0.6999,
"step": 289
},
{
"epoch": 1.5279366476022878,
"grad_norm": 1.404502862270622,
"learning_rate": 5.706030759115458e-06,
"loss": 0.6502,
"step": 290
},
{
"epoch": 1.5332160140783107,
"grad_norm": 1.3895925622506242,
"learning_rate": 5.675526237406965e-06,
"loss": 0.6693,
"step": 291
},
{
"epoch": 1.5384953805543335,
"grad_norm": 1.3933211625692163,
"learning_rate": 5.644996082651018e-06,
"loss": 0.6272,
"step": 292
},
{
"epoch": 1.5437747470303562,
"grad_norm": 1.2462836635087724,
"learning_rate": 5.614441453323571e-06,
"loss": 0.6725,
"step": 293
},
{
"epoch": 1.5490541135063793,
"grad_norm": 1.4996013016049565,
"learning_rate": 5.583863508829281e-06,
"loss": 0.6956,
"step": 294
},
{
"epoch": 1.554333479982402,
"grad_norm": 1.3766256340590475,
"learning_rate": 5.553263409457504e-06,
"loss": 0.659,
"step": 295
},
{
"epoch": 1.5596128464584251,
"grad_norm": 1.3350837109105465,
"learning_rate": 5.522642316338268e-06,
"loss": 0.6357,
"step": 296
},
{
"epoch": 1.5648922129344478,
"grad_norm": 1.3570996604619927,
"learning_rate": 5.492001391398214e-06,
"loss": 0.6544,
"step": 297
},
{
"epoch": 1.5701715794104707,
"grad_norm": 1.4608558691508997,
"learning_rate": 5.46134179731651e-06,
"loss": 0.6512,
"step": 298
},
{
"epoch": 1.5754509458864936,
"grad_norm": 1.2494448543139998,
"learning_rate": 5.430664697480731e-06,
"loss": 0.5658,
"step": 299
},
{
"epoch": 1.5807303123625165,
"grad_norm": 1.444693017380396,
"learning_rate": 5.399971255942708e-06,
"loss": 0.6901,
"step": 300
},
{
"epoch": 1.5860096788385394,
"grad_norm": 1.4186391329903683,
"learning_rate": 5.36926263737437e-06,
"loss": 0.8807,
"step": 301
},
{
"epoch": 1.5912890453145623,
"grad_norm": 1.29633534515009,
"learning_rate": 5.338540007023538e-06,
"loss": 0.6461,
"step": 302
},
{
"epoch": 1.5965684117905852,
"grad_norm": 1.4448726879769416,
"learning_rate": 5.3078045306697154e-06,
"loss": 0.6523,
"step": 303
},
{
"epoch": 1.6018477782666078,
"grad_norm": 1.266507195220378,
"learning_rate": 5.27705737457985e-06,
"loss": 0.6408,
"step": 304
},
{
"epoch": 1.607127144742631,
"grad_norm": 1.3540583386343656,
"learning_rate": 5.246299705464085e-06,
"loss": 0.6488,
"step": 305
},
{
"epoch": 1.6124065112186536,
"grad_norm": 1.343878144578292,
"learning_rate": 5.2155326904314795e-06,
"loss": 0.6031,
"step": 306
},
{
"epoch": 1.6176858776946768,
"grad_norm": 1.390922633295502,
"learning_rate": 5.184757496945726e-06,
"loss": 0.6732,
"step": 307
},
{
"epoch": 1.6229652441706994,
"grad_norm": 1.303700297184845,
"learning_rate": 5.153975292780852e-06,
"loss": 0.644,
"step": 308
},
{
"epoch": 1.6282446106467225,
"grad_norm": 1.4719857684130002,
"learning_rate": 5.123187245976912e-06,
"loss": 0.6542,
"step": 309
},
{
"epoch": 1.6335239771227452,
"grad_norm": 1.5316116004451763,
"learning_rate": 5.09239452479565e-06,
"loss": 0.6741,
"step": 310
},
{
"epoch": 1.6388033435987681,
"grad_norm": 1.5058092447545324,
"learning_rate": 5.061598297676192e-06,
"loss": 0.6624,
"step": 311
},
{
"epoch": 1.644082710074791,
"grad_norm": 1.2957852805869594,
"learning_rate": 5.030799733190694e-06,
"loss": 0.6866,
"step": 312
},
{
"epoch": 1.649362076550814,
"grad_norm": 1.3465817125883073,
"learning_rate": 5e-06,
"loss": 0.665,
"step": 313
},
{
"epoch": 1.6546414430268368,
"grad_norm": 1.2222436930506864,
"learning_rate": 4.9692002668093075e-06,
"loss": 0.5887,
"step": 314
},
{
"epoch": 1.6599208095028597,
"grad_norm": 1.441331154425715,
"learning_rate": 4.9384017023238085e-06,
"loss": 0.673,
"step": 315
},
{
"epoch": 1.6652001759788826,
"grad_norm": 1.3514338153223537,
"learning_rate": 4.907605475204352e-06,
"loss": 0.7095,
"step": 316
},
{
"epoch": 1.6704795424549053,
"grad_norm": 1.4614586482457859,
"learning_rate": 4.876812754023092e-06,
"loss": 0.7205,
"step": 317
},
{
"epoch": 1.6757589089309284,
"grad_norm": 1.3928056564895086,
"learning_rate": 4.846024707219149e-06,
"loss": 0.6358,
"step": 318
},
{
"epoch": 1.681038275406951,
"grad_norm": 1.3113749307682454,
"learning_rate": 4.815242503054277e-06,
"loss": 0.6465,
"step": 319
},
{
"epoch": 1.6863176418829742,
"grad_norm": 1.4316497180240197,
"learning_rate": 4.784467309568524e-06,
"loss": 0.6794,
"step": 320
},
{
"epoch": 1.6915970083589968,
"grad_norm": 1.3763481895692722,
"learning_rate": 4.753700294535916e-06,
"loss": 0.7105,
"step": 321
},
{
"epoch": 1.69687637483502,
"grad_norm": 1.3560535615138942,
"learning_rate": 4.7229426254201504e-06,
"loss": 0.6566,
"step": 322
},
{
"epoch": 1.7021557413110426,
"grad_norm": 1.3113897207300194,
"learning_rate": 4.692195469330286e-06,
"loss": 0.753,
"step": 323
},
{
"epoch": 1.7074351077870655,
"grad_norm": 1.2314416333529012,
"learning_rate": 4.661459992976463e-06,
"loss": 0.6087,
"step": 324
},
{
"epoch": 1.7127144742630884,
"grad_norm": 1.357070932304121,
"learning_rate": 4.630737362625631e-06,
"loss": 0.678,
"step": 325
},
{
"epoch": 1.7179938407391113,
"grad_norm": 1.3907841932602958,
"learning_rate": 4.6000287440572925e-06,
"loss": 0.6819,
"step": 326
},
{
"epoch": 1.7232732072151342,
"grad_norm": 1.3821824046618116,
"learning_rate": 4.569335302519271e-06,
"loss": 0.6329,
"step": 327
},
{
"epoch": 1.7285525736911571,
"grad_norm": 1.4473432204015564,
"learning_rate": 4.53865820268349e-06,
"loss": 0.7144,
"step": 328
},
{
"epoch": 1.73383194016718,
"grad_norm": 1.4376742031177947,
"learning_rate": 4.507998608601787e-06,
"loss": 0.6086,
"step": 329
},
{
"epoch": 1.7391113066432027,
"grad_norm": 1.2849628847256984,
"learning_rate": 4.477357683661734e-06,
"loss": 0.6101,
"step": 330
},
{
"epoch": 1.7443906731192258,
"grad_norm": 1.3554057763386258,
"learning_rate": 4.446736590542497e-06,
"loss": 0.5833,
"step": 331
},
{
"epoch": 1.7496700395952485,
"grad_norm": 1.3213798453951964,
"learning_rate": 4.41613649117072e-06,
"loss": 0.6859,
"step": 332
},
{
"epoch": 1.7549494060712716,
"grad_norm": 1.319837554365992,
"learning_rate": 4.3855585466764305e-06,
"loss": 0.655,
"step": 333
},
{
"epoch": 1.7602287725472943,
"grad_norm": 1.3686144434660683,
"learning_rate": 4.355003917348985e-06,
"loss": 0.6474,
"step": 334
},
{
"epoch": 1.7655081390233172,
"grad_norm": 1.3793264604803168,
"learning_rate": 4.324473762593037e-06,
"loss": 0.5843,
"step": 335
},
{
"epoch": 1.77078750549934,
"grad_norm": 1.3441234479337094,
"learning_rate": 4.293969240884545e-06,
"loss": 0.5984,
"step": 336
},
{
"epoch": 1.776066871975363,
"grad_norm": 1.237308449464165,
"learning_rate": 4.263491509726812e-06,
"loss": 0.6477,
"step": 337
},
{
"epoch": 1.7813462384513858,
"grad_norm": 1.3355474025021052,
"learning_rate": 4.233041725606573e-06,
"loss": 0.636,
"step": 338
},
{
"epoch": 1.7866256049274087,
"grad_norm": 1.3458947073703338,
"learning_rate": 4.202621043950096e-06,
"loss": 0.6152,
"step": 339
},
{
"epoch": 1.7919049714034316,
"grad_norm": 1.3724772310082562,
"learning_rate": 4.17223061907935e-06,
"loss": 0.6669,
"step": 340
},
{
"epoch": 1.7971843378794543,
"grad_norm": 1.3927314177261432,
"learning_rate": 4.141871604168201e-06,
"loss": 0.6871,
"step": 341
},
{
"epoch": 1.8024637043554774,
"grad_norm": 1.425898039985732,
"learning_rate": 4.111545151198657e-06,
"loss": 0.6479,
"step": 342
},
{
"epoch": 1.8077430708315,
"grad_norm": 1.4786764449830878,
"learning_rate": 4.081252410917148e-06,
"loss": 0.6758,
"step": 343
},
{
"epoch": 1.8130224373075232,
"grad_norm": 1.5596067837918601,
"learning_rate": 4.050994532790871e-06,
"loss": 0.6792,
"step": 344
},
{
"epoch": 1.818301803783546,
"grad_norm": 1.3295616520702254,
"learning_rate": 4.020772664964166e-06,
"loss": 0.6447,
"step": 345
},
{
"epoch": 1.823581170259569,
"grad_norm": 1.356711049558864,
"learning_rate": 3.99058795421495e-06,
"loss": 0.6988,
"step": 346
},
{
"epoch": 1.8288605367355917,
"grad_norm": 1.3459848599920097,
"learning_rate": 3.960441545911205e-06,
"loss": 0.6793,
"step": 347
},
{
"epoch": 1.8341399032116146,
"grad_norm": 1.2796136680768018,
"learning_rate": 3.930334583967514e-06,
"loss": 0.6404,
"step": 348
},
{
"epoch": 1.8394192696876375,
"grad_norm": 1.430373997763793,
"learning_rate": 3.9002682108016585e-06,
"loss": 0.7089,
"step": 349
},
{
"epoch": 1.8446986361636604,
"grad_norm": 1.443868769930965,
"learning_rate": 3.870243567291263e-06,
"loss": 0.6088,
"step": 350
},
{
"epoch": 1.8499780026396833,
"grad_norm": 1.3354422816955691,
"learning_rate": 3.840261792730511e-06,
"loss": 0.6469,
"step": 351
},
{
"epoch": 1.8552573691157062,
"grad_norm": 1.5007921332211551,
"learning_rate": 3.8103240247869077e-06,
"loss": 0.6917,
"step": 352
},
{
"epoch": 1.860536735591729,
"grad_norm": 1.4369902254697013,
"learning_rate": 3.7804313994581143e-06,
"loss": 0.6745,
"step": 353
},
{
"epoch": 1.8658161020677517,
"grad_norm": 1.2954980945001948,
"learning_rate": 3.7505850510288455e-06,
"loss": 0.6402,
"step": 354
},
{
"epoch": 1.8710954685437748,
"grad_norm": 1.3479250337805435,
"learning_rate": 3.720786112027822e-06,
"loss": 0.6281,
"step": 355
},
{
"epoch": 1.8763748350197975,
"grad_norm": 1.3528354522984527,
"learning_rate": 3.6910357131847986e-06,
"loss": 0.6253,
"step": 356
},
{
"epoch": 1.8816542014958206,
"grad_norm": 1.2410984546080153,
"learning_rate": 3.6613349833876607e-06,
"loss": 0.5576,
"step": 357
},
{
"epoch": 1.8869335679718433,
"grad_norm": 1.1827340907861352,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.5936,
"step": 358
},
{
"epoch": 1.8922129344478662,
"grad_norm": 1.2980573086194132,
"learning_rate": 3.602087037016281e-06,
"loss": 0.8214,
"step": 359
},
{
"epoch": 1.897492300923889,
"grad_norm": 1.4315757982637016,
"learning_rate": 3.5725420686232903e-06,
"loss": 0.6522,
"step": 360
},
{
"epoch": 1.902771667399912,
"grad_norm": 1.4091204255580805,
"learning_rate": 3.5430512655533774e-06,
"loss": 0.5795,
"step": 361
},
{
"epoch": 1.9080510338759349,
"grad_norm": 1.3444722372985694,
"learning_rate": 3.513615746843987e-06,
"loss": 0.7231,
"step": 362
},
{
"epoch": 1.9133304003519578,
"grad_norm": 1.5246355682127404,
"learning_rate": 3.484236629434783e-06,
"loss": 0.6603,
"step": 363
},
{
"epoch": 1.9186097668279807,
"grad_norm": 1.415464008217028,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.6775,
"step": 364
},
{
"epoch": 1.9238891333040036,
"grad_norm": 1.3192883237623132,
"learning_rate": 3.4256520555324613e-06,
"loss": 0.6316,
"step": 365
},
{
"epoch": 1.9291684997800265,
"grad_norm": 1.428352611949904,
"learning_rate": 3.3964488220487252e-06,
"loss": 0.6544,
"step": 366
},
{
"epoch": 1.9344478662560491,
"grad_norm": 1.5172404820075067,
"learning_rate": 3.3673064357995844e-06,
"loss": 0.5938,
"step": 367
},
{
"epoch": 1.9397272327320723,
"grad_norm": 1.3819196499385575,
"learning_rate": 3.3382260026017027e-06,
"loss": 0.658,
"step": 368
},
{
"epoch": 1.945006599208095,
"grad_norm": 1.2219098246955071,
"learning_rate": 3.3092086259209144e-06,
"loss": 0.6436,
"step": 369
},
{
"epoch": 1.950285965684118,
"grad_norm": 1.3112983916512726,
"learning_rate": 3.2802554068303595e-06,
"loss": 0.6277,
"step": 370
},
{
"epoch": 1.9555653321601407,
"grad_norm": 1.3181733253990144,
"learning_rate": 3.2513674439686945e-06,
"loss": 0.6051,
"step": 371
},
{
"epoch": 1.9608446986361636,
"grad_norm": 1.3950428699097805,
"learning_rate": 3.22254583349841e-06,
"loss": 0.6047,
"step": 372
},
{
"epoch": 1.9661240651121865,
"grad_norm": 1.401575375536184,
"learning_rate": 3.1937916690642356e-06,
"loss": 0.7536,
"step": 373
},
{
"epoch": 1.9714034315882094,
"grad_norm": 1.4675958168712424,
"learning_rate": 3.16510604175164e-06,
"loss": 0.6029,
"step": 374
},
{
"epoch": 1.9766827980642323,
"grad_norm": 1.2821606191811943,
"learning_rate": 3.13649004004543e-06,
"loss": 0.5921,
"step": 375
},
{
"epoch": 1.9819621645402552,
"grad_norm": 1.352642460803231,
"learning_rate": 3.107944749788449e-06,
"loss": 0.6541,
"step": 376
},
{
"epoch": 1.987241531016278,
"grad_norm": 1.4500846871984014,
"learning_rate": 3.0794712541403716e-06,
"loss": 0.5958,
"step": 377
},
{
"epoch": 1.9925208974923008,
"grad_norm": 1.344043810326557,
"learning_rate": 3.0510706335366034e-06,
"loss": 0.678,
"step": 378
},
{
"epoch": 1.9978002639683239,
"grad_norm": 1.3311569208901686,
"learning_rate": 3.0227439656472878e-06,
"loss": 0.5696,
"step": 379
},
{
"epoch": 2.0,
"grad_norm": 1.3311569208901686,
"learning_rate": 2.9944923253364066e-06,
"loss": 0.6124,
"step": 380
},
{
"epoch": 2.0052793664760227,
"grad_norm": 2.2453037842894434,
"learning_rate": 2.966316784621e-06,
"loss": 0.5021,
"step": 381
},
{
"epoch": 2.010558732952046,
"grad_norm": 1.448400627279987,
"learning_rate": 2.9382184126304834e-06,
"loss": 0.4754,
"step": 382
},
{
"epoch": 2.0158380994280685,
"grad_norm": 1.3928060588224582,
"learning_rate": 2.910198275566085e-06,
"loss": 0.5552,
"step": 383
},
{
"epoch": 2.0211174659040916,
"grad_norm": 1.3053890423029602,
"learning_rate": 2.8822574366603804e-06,
"loss": 0.5096,
"step": 384
},
{
"epoch": 2.0263968323801143,
"grad_norm": 1.312006902737542,
"learning_rate": 2.8543969561369556e-06,
"loss": 0.4384,
"step": 385
},
{
"epoch": 2.0316761988561374,
"grad_norm": 1.3424294511811377,
"learning_rate": 2.8266178911701757e-06,
"loss": 0.4524,
"step": 386
},
{
"epoch": 2.03695556533216,
"grad_norm": 1.353115219108087,
"learning_rate": 2.798921295845064e-06,
"loss": 0.5847,
"step": 387
},
{
"epoch": 2.042234931808183,
"grad_norm": 1.359151667072332,
"learning_rate": 2.771308221117309e-06,
"loss": 0.5274,
"step": 388
},
{
"epoch": 2.047514298284206,
"grad_norm": 1.1804464420852299,
"learning_rate": 2.743779714773386e-06,
"loss": 0.4868,
"step": 389
},
{
"epoch": 2.052793664760229,
"grad_norm": 1.2257778269575734,
"learning_rate": 2.7163368213907975e-06,
"loss": 0.4974,
"step": 390
},
{
"epoch": 2.0580730312362516,
"grad_norm": 1.3592658717787198,
"learning_rate": 2.6889805822984348e-06,
"loss": 0.429,
"step": 391
},
{
"epoch": 2.0633523977122747,
"grad_norm": 1.4335615689021757,
"learning_rate": 2.6617120355370667e-06,
"loss": 0.4936,
"step": 392
},
{
"epoch": 2.0686317641882974,
"grad_norm": 1.4971943650916089,
"learning_rate": 2.6345322158199503e-06,
"loss": 0.4891,
"step": 393
},
{
"epoch": 2.07391113066432,
"grad_norm": 1.3630600653303417,
"learning_rate": 2.607442154493568e-06,
"loss": 0.4339,
"step": 394
},
{
"epoch": 2.079190497140343,
"grad_norm": 1.3452492502199729,
"learning_rate": 2.5804428794984926e-06,
"loss": 0.4788,
"step": 395
},
{
"epoch": 2.084469863616366,
"grad_norm": 1.3717266175336726,
"learning_rate": 2.5535354153303827e-06,
"loss": 0.4589,
"step": 396
},
{
"epoch": 2.089749230092389,
"grad_norm": 1.4172021390970797,
"learning_rate": 2.526720783001107e-06,
"loss": 0.474,
"step": 397
},
{
"epoch": 2.0950285965684117,
"grad_norm": 1.4012306931647633,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.4703,
"step": 398
},
{
"epoch": 2.100307963044435,
"grad_norm": 1.3223744927738885,
"learning_rate": 2.473374080255261e-06,
"loss": 0.4927,
"step": 399
},
{
"epoch": 2.1055873295204575,
"grad_norm": 1.4340111903752237,
"learning_rate": 2.4468440340954664e-06,
"loss": 0.4911,
"step": 400
},
{
"epoch": 2.1108666959964806,
"grad_norm": 1.4908200109241228,
"learning_rate": 2.4204108682112443e-06,
"loss": 0.4923,
"step": 401
},
{
"epoch": 2.1161460624725033,
"grad_norm": 1.4498803710381125,
"learning_rate": 2.3940755856170744e-06,
"loss": 0.5264,
"step": 402
},
{
"epoch": 2.1214254289485264,
"grad_norm": 1.5579303480326079,
"learning_rate": 2.3678391856132203e-06,
"loss": 0.4671,
"step": 403
},
{
"epoch": 2.126704795424549,
"grad_norm": 1.3682695560666365,
"learning_rate": 2.341702663747819e-06,
"loss": 0.4791,
"step": 404
},
{
"epoch": 2.131984161900572,
"grad_norm": 1.3740119474315011,
"learning_rate": 2.3156670117790996e-06,
"loss": 0.5081,
"step": 405
},
{
"epoch": 2.137263528376595,
"grad_norm": 1.4378876285894175,
"learning_rate": 2.289733217637753e-06,
"loss": 0.6887,
"step": 406
},
{
"epoch": 2.1425428948526175,
"grad_norm": 1.3945669735187922,
"learning_rate": 2.2639022653894443e-06,
"loss": 0.4747,
"step": 407
},
{
"epoch": 2.1478222613286406,
"grad_norm": 1.5102801606024971,
"learning_rate": 2.238175135197471e-06,
"loss": 0.4772,
"step": 408
},
{
"epoch": 2.1531016278046633,
"grad_norm": 1.343039872751995,
"learning_rate": 2.2125528032855727e-06,
"loss": 0.4662,
"step": 409
},
{
"epoch": 2.1583809942806864,
"grad_norm": 1.7777099063240305,
"learning_rate": 2.1870362419008844e-06,
"loss": 0.4426,
"step": 410
},
{
"epoch": 2.163660360756709,
"grad_norm": 1.3840206283915173,
"learning_rate": 2.1616264192770496e-06,
"loss": 0.451,
"step": 411
},
{
"epoch": 2.168939727232732,
"grad_norm": 1.2930950541912372,
"learning_rate": 2.136324299597474e-06,
"loss": 0.523,
"step": 412
},
{
"epoch": 2.174219093708755,
"grad_norm": 1.4431983442764567,
"learning_rate": 2.1111308429587446e-06,
"loss": 0.4051,
"step": 413
},
{
"epoch": 2.179498460184778,
"grad_norm": 1.2550043379116107,
"learning_rate": 2.0860470053341957e-06,
"loss": 0.499,
"step": 414
},
{
"epoch": 2.1847778266608007,
"grad_norm": 1.4359151954027813,
"learning_rate": 2.061073738537635e-06,
"loss": 0.4591,
"step": 415
},
{
"epoch": 2.190057193136824,
"grad_norm": 1.3446535561143784,
"learning_rate": 2.0362119901872262e-06,
"loss": 0.4076,
"step": 416
},
{
"epoch": 2.1953365596128465,
"grad_norm": 1.2459969483698727,
"learning_rate": 2.011462703669532e-06,
"loss": 0.4957,
"step": 417
},
{
"epoch": 2.200615926088869,
"grad_norm": 1.2992424145598012,
"learning_rate": 1.9868268181037186e-06,
"loss": 0.463,
"step": 418
},
{
"epoch": 2.2058952925648923,
"grad_norm": 1.4602888158466671,
"learning_rate": 1.9623052683059164e-06,
"loss": 0.4719,
"step": 419
},
{
"epoch": 2.211174659040915,
"grad_norm": 1.3759597249907445,
"learning_rate": 1.937898984753751e-06,
"loss": 0.4951,
"step": 420
},
{
"epoch": 2.216454025516938,
"grad_norm": 1.412150518484592,
"learning_rate": 1.913608893551036e-06,
"loss": 0.52,
"step": 421
},
{
"epoch": 2.2217333919929607,
"grad_norm": 1.485490691650101,
"learning_rate": 1.8894359163926312e-06,
"loss": 0.444,
"step": 422
},
{
"epoch": 2.227012758468984,
"grad_norm": 1.3670795340613098,
"learning_rate": 1.865380970529469e-06,
"loss": 0.5399,
"step": 423
},
{
"epoch": 2.2322921249450065,
"grad_norm": 1.3525729496527066,
"learning_rate": 1.8414449687337467e-06,
"loss": 0.5159,
"step": 424
},
{
"epoch": 2.2375714914210296,
"grad_norm": 1.4059006780837846,
"learning_rate": 1.8176288192642944e-06,
"loss": 0.5099,
"step": 425
},
{
"epoch": 2.2428508578970523,
"grad_norm": 1.2761502912826002,
"learning_rate": 1.7939334258321094e-06,
"loss": 0.4717,
"step": 426
},
{
"epoch": 2.2481302243730754,
"grad_norm": 1.4040503918498035,
"learning_rate": 1.7703596875660645e-06,
"loss": 0.4469,
"step": 427
},
{
"epoch": 2.253409590849098,
"grad_norm": 1.2908543753758535,
"learning_rate": 1.746908498978791e-06,
"loss": 0.485,
"step": 428
},
{
"epoch": 2.2586889573251208,
"grad_norm": 1.3759893346792271,
"learning_rate": 1.7235807499327335e-06,
"loss": 0.5101,
"step": 429
},
{
"epoch": 2.263968323801144,
"grad_norm": 1.4728227490351313,
"learning_rate": 1.7003773256063882e-06,
"loss": 0.5347,
"step": 430
},
{
"epoch": 2.2692476902771666,
"grad_norm": 1.489969037850193,
"learning_rate": 1.6772991064607113e-06,
"loss": 0.4467,
"step": 431
},
{
"epoch": 2.2745270567531897,
"grad_norm": 1.417700490467943,
"learning_rate": 1.6543469682057105e-06,
"loss": 0.481,
"step": 432
},
{
"epoch": 2.2798064232292123,
"grad_norm": 1.386346425394359,
"learning_rate": 1.6315217817672142e-06,
"loss": 0.4621,
"step": 433
},
{
"epoch": 2.2850857897052355,
"grad_norm": 1.2589332584059243,
"learning_rate": 1.60882441325383e-06,
"loss": 0.5175,
"step": 434
},
{
"epoch": 2.290365156181258,
"grad_norm": 1.3880552404176263,
"learning_rate": 1.5862557239240729e-06,
"loss": 0.4387,
"step": 435
},
{
"epoch": 2.2956445226572813,
"grad_norm": 1.360835274455909,
"learning_rate": 1.5638165701536866e-06,
"loss": 0.5115,
"step": 436
},
{
"epoch": 2.300923889133304,
"grad_norm": 1.3843299657391916,
"learning_rate": 1.54150780340315e-06,
"loss": 0.4899,
"step": 437
},
{
"epoch": 2.306203255609327,
"grad_norm": 1.3835648119835473,
"learning_rate": 1.5193302701853674e-06,
"loss": 0.4664,
"step": 438
},
{
"epoch": 2.3114826220853497,
"grad_norm": 1.4640561341524838,
"learning_rate": 1.4972848120335453e-06,
"loss": 0.4609,
"step": 439
},
{
"epoch": 2.316761988561373,
"grad_norm": 1.421258969771172,
"learning_rate": 1.475372265469265e-06,
"loss": 0.4763,
"step": 440
},
{
"epoch": 2.3220413550373955,
"grad_norm": 1.2751966894937146,
"learning_rate": 1.453593461970733e-06,
"loss": 0.4701,
"step": 441
},
{
"epoch": 2.327320721513418,
"grad_norm": 1.340345062295216,
"learning_rate": 1.4319492279412388e-06,
"loss": 0.4832,
"step": 442
},
{
"epoch": 2.3326000879894413,
"grad_norm": 1.4159387316202012,
"learning_rate": 1.410440384677791e-06,
"loss": 0.42,
"step": 443
},
{
"epoch": 2.337879454465464,
"grad_norm": 1.2648065651654534,
"learning_rate": 1.389067748339954e-06,
"loss": 0.4404,
"step": 444
},
{
"epoch": 2.343158820941487,
"grad_norm": 1.3711182773880273,
"learning_rate": 1.3678321299188802e-06,
"loss": 0.4388,
"step": 445
},
{
"epoch": 2.3484381874175098,
"grad_norm": 1.3906995330557852,
"learning_rate": 1.3467343352065349e-06,
"loss": 0.5312,
"step": 446
},
{
"epoch": 2.353717553893533,
"grad_norm": 1.397658739943819,
"learning_rate": 1.3257751647651223e-06,
"loss": 0.4675,
"step": 447
},
{
"epoch": 2.3589969203695555,
"grad_norm": 1.4627906940091926,
"learning_rate": 1.3049554138967052e-06,
"loss": 0.4395,
"step": 448
},
{
"epoch": 2.3642762868455787,
"grad_norm": 1.381779631997442,
"learning_rate": 1.2842758726130283e-06,
"loss": 0.569,
"step": 449
},
{
"epoch": 2.3695556533216013,
"grad_norm": 1.4402592673817487,
"learning_rate": 1.2637373256055445e-06,
"loss": 0.4903,
"step": 450
},
{
"epoch": 2.3748350197976245,
"grad_norm": 1.3333412823689215,
"learning_rate": 1.2433405522156334e-06,
"loss": 0.4824,
"step": 451
},
{
"epoch": 2.380114386273647,
"grad_norm": 1.4106276931723192,
"learning_rate": 1.2230863264050308e-06,
"loss": 0.487,
"step": 452
},
{
"epoch": 2.3853937527496702,
"grad_norm": 1.3557673199870695,
"learning_rate": 1.202975416726464e-06,
"loss": 0.5265,
"step": 453
},
{
"epoch": 2.390673119225693,
"grad_norm": 1.4343260095491823,
"learning_rate": 1.1830085862944851e-06,
"loss": 0.449,
"step": 454
},
{
"epoch": 2.3959524857017156,
"grad_norm": 1.446767986226991,
"learning_rate": 1.163186592756515e-06,
"loss": 0.4699,
"step": 455
},
{
"epoch": 2.4012318521777387,
"grad_norm": 1.424117946851896,
"learning_rate": 1.1435101882640964e-06,
"loss": 0.4514,
"step": 456
},
{
"epoch": 2.4065112186537614,
"grad_norm": 1.5365862656275142,
"learning_rate": 1.1239801194443507e-06,
"loss": 0.4373,
"step": 457
},
{
"epoch": 2.4117905851297845,
"grad_norm": 1.2648145260275343,
"learning_rate": 1.1045971273716476e-06,
"loss": 0.4329,
"step": 458
},
{
"epoch": 2.417069951605807,
"grad_norm": 1.3311965815866447,
"learning_rate": 1.085361947539486e-06,
"loss": 0.4769,
"step": 459
},
{
"epoch": 2.4223493180818303,
"grad_norm": 1.3975266216041633,
"learning_rate": 1.066275309832584e-06,
"loss": 0.472,
"step": 460
},
{
"epoch": 2.427628684557853,
"grad_norm": 1.3252996893653324,
"learning_rate": 1.0473379384991833e-06,
"loss": 0.4243,
"step": 461
},
{
"epoch": 2.432908051033876,
"grad_norm": 1.4139994351082152,
"learning_rate": 1.02855055212357e-06,
"loss": 0.4785,
"step": 462
},
{
"epoch": 2.4381874175098988,
"grad_norm": 1.3372117034643396,
"learning_rate": 1.0099138635988026e-06,
"loss": 0.4215,
"step": 463
},
{
"epoch": 2.443466783985922,
"grad_norm": 1.3606372944047547,
"learning_rate": 9.91428580099667e-07,
"loss": 0.4413,
"step": 464
},
{
"epoch": 2.4487461504619445,
"grad_norm": 1.318507119449003,
"learning_rate": 9.73095403055837e-07,
"loss": 0.415,
"step": 465
},
{
"epoch": 2.4540255169379677,
"grad_norm": 1.2123750888837692,
"learning_rate": 9.549150281252633e-07,
"loss": 0.4886,
"step": 466
},
{
"epoch": 2.4593048834139903,
"grad_norm": 1.3528666735660853,
"learning_rate": 9.368881451677725e-07,
"loss": 0.4838,
"step": 467
},
{
"epoch": 2.464584249890013,
"grad_norm": 1.3418881482008247,
"learning_rate": 9.190154382188921e-07,
"loss": 0.4466,
"step": 468
},
{
"epoch": 2.469863616366036,
"grad_norm": 1.3296751447001665,
"learning_rate": 9.01297585463895e-07,
"loss": 0.4595,
"step": 469
},
{
"epoch": 2.475142982842059,
"grad_norm": 1.4089808402301305,
"learning_rate": 8.837352592120646e-07,
"loss": 0.4365,
"step": 470
},
{
"epoch": 2.480422349318082,
"grad_norm": 1.402157984134382,
"learning_rate": 8.663291258711831e-07,
"loss": 0.4776,
"step": 471
},
{
"epoch": 2.4857017157941046,
"grad_norm": 1.334493882835527,
"learning_rate": 8.490798459222477e-07,
"loss": 0.446,
"step": 472
},
{
"epoch": 2.4909810822701277,
"grad_norm": 1.4469360566334513,
"learning_rate": 8.31988073894403e-07,
"loss": 0.5585,
"step": 473
},
{
"epoch": 2.4962604487461504,
"grad_norm": 1.3547614775330397,
"learning_rate": 8.150544583401116e-07,
"loss": 0.4951,
"step": 474
},
{
"epoch": 2.5015398152221735,
"grad_norm": 1.4317273864472844,
"learning_rate": 7.98279641810537e-07,
"loss": 0.4658,
"step": 475
},
{
"epoch": 2.506819181698196,
"grad_norm": 1.3305013073280645,
"learning_rate": 7.816642608311692e-07,
"loss": 0.5777,
"step": 476
},
{
"epoch": 2.512098548174219,
"grad_norm": 1.3136690941260454,
"learning_rate": 7.652089458776651e-07,
"loss": 0.499,
"step": 477
},
{
"epoch": 2.517377914650242,
"grad_norm": 1.3874569359917572,
"learning_rate": 7.489143213519301e-07,
"loss": 0.5347,
"step": 478
},
{
"epoch": 2.522657281126265,
"grad_norm": 1.447251018353361,
"learning_rate": 7.327810055584211e-07,
"loss": 0.435,
"step": 479
},
{
"epoch": 2.5279366476022878,
"grad_norm": 1.3496149617623217,
"learning_rate": 7.168096106806871e-07,
"loss": 0.4171,
"step": 480
},
{
"epoch": 2.5332160140783104,
"grad_norm": 1.249564521876932,
"learning_rate": 7.010007427581378e-07,
"loss": 0.4364,
"step": 481
},
{
"epoch": 2.5384953805543335,
"grad_norm": 1.16664480296305,
"learning_rate": 6.853550016630517e-07,
"loss": 0.4704,
"step": 482
},
{
"epoch": 2.543774747030356,
"grad_norm": 1.360240021411605,
"learning_rate": 6.698729810778065e-07,
"loss": 0.4452,
"step": 483
},
{
"epoch": 2.5490541135063793,
"grad_norm": 1.342403768766378,
"learning_rate": 6.545552684723583e-07,
"loss": 0.4693,
"step": 484
},
{
"epoch": 2.554333479982402,
"grad_norm": 1.3325018963351474,
"learning_rate": 6.394024450819458e-07,
"loss": 0.6651,
"step": 485
},
{
"epoch": 2.559612846458425,
"grad_norm": 1.3913165441700324,
"learning_rate": 6.244150858850368e-07,
"loss": 0.4975,
"step": 486
},
{
"epoch": 2.564892212934448,
"grad_norm": 1.2628490579653824,
"learning_rate": 6.095937595815104e-07,
"loss": 0.492,
"step": 487
},
{
"epoch": 2.570171579410471,
"grad_norm": 1.2910442047354849,
"learning_rate": 5.949390285710777e-07,
"loss": 0.4534,
"step": 488
},
{
"epoch": 2.5754509458864936,
"grad_norm": 1.3249433260471921,
"learning_rate": 5.804514489319402e-07,
"loss": 0.487,
"step": 489
},
{
"epoch": 2.5807303123625163,
"grad_norm": 1.4072341388559009,
"learning_rate": 5.661315703996905e-07,
"loss": 0.4675,
"step": 490
},
{
"epoch": 2.5860096788385394,
"grad_norm": 1.4015735453825087,
"learning_rate": 5.519799363464523e-07,
"loss": 0.4845,
"step": 491
},
{
"epoch": 2.5912890453145625,
"grad_norm": 1.4217891239143823,
"learning_rate": 5.379970837602611e-07,
"loss": 0.4998,
"step": 492
},
{
"epoch": 2.596568411790585,
"grad_norm": 1.3648246182499375,
"learning_rate": 5.241835432246888e-07,
"loss": 0.5176,
"step": 493
},
{
"epoch": 2.601847778266608,
"grad_norm": 1.414958969535076,
"learning_rate": 5.105398388987098e-07,
"loss": 0.4967,
"step": 494
},
{
"epoch": 2.607127144742631,
"grad_norm": 1.3790275298806813,
"learning_rate": 4.970664884968135e-07,
"loss": 0.4526,
"step": 495
},
{
"epoch": 2.6124065112186536,
"grad_norm": 1.4260512881673313,
"learning_rate": 4.837640032693558e-07,
"loss": 0.4988,
"step": 496
},
{
"epoch": 2.6176858776946768,
"grad_norm": 1.4375522094160005,
"learning_rate": 4.7063288798316397e-07,
"loss": 0.5034,
"step": 497
},
{
"epoch": 2.6229652441706994,
"grad_norm": 1.2795249845130867,
"learning_rate": 4.576736409023813e-07,
"loss": 0.4697,
"step": 498
},
{
"epoch": 2.6282446106467225,
"grad_norm": 1.341408752696913,
"learning_rate": 4.448867537695578e-07,
"loss": 0.4577,
"step": 499
},
{
"epoch": 2.633523977122745,
"grad_norm": 1.388819232620493,
"learning_rate": 4.322727117869951e-07,
"loss": 0.4578,
"step": 500
},
{
"epoch": 2.6388033435987683,
"grad_norm": 1.4259575315112532,
"learning_rate": 4.198319935983325e-07,
"loss": 0.432,
"step": 501
},
{
"epoch": 2.644082710074791,
"grad_norm": 1.3388679638482945,
"learning_rate": 4.0756507127038494e-07,
"loss": 0.4297,
"step": 502
},
{
"epoch": 2.6493620765508137,
"grad_norm": 1.343061176539468,
"learning_rate": 3.9547241027523164e-07,
"loss": 0.4731,
"step": 503
},
{
"epoch": 2.654641443026837,
"grad_norm": 1.3358210629083995,
"learning_rate": 3.8355446947255293e-07,
"loss": 0.3901,
"step": 504
},
{
"epoch": 2.65992080950286,
"grad_norm": 1.2293751714391306,
"learning_rate": 3.71811701092219e-07,
"loss": 0.4707,
"step": 505
},
{
"epoch": 2.6652001759788826,
"grad_norm": 1.3129862851940244,
"learning_rate": 3.602445507171276e-07,
"loss": 0.4352,
"step": 506
},
{
"epoch": 2.6704795424549053,
"grad_norm": 1.4034927380523827,
"learning_rate": 3.488534572662994e-07,
"loss": 0.4641,
"step": 507
},
{
"epoch": 2.6757589089309284,
"grad_norm": 1.2489349783536317,
"learning_rate": 3.3763885297822153e-07,
"loss": 0.4681,
"step": 508
},
{
"epoch": 2.681038275406951,
"grad_norm": 1.338257980632932,
"learning_rate": 3.266011633944477e-07,
"loss": 0.4466,
"step": 509
},
{
"epoch": 2.686317641882974,
"grad_norm": 1.340765808641228,
"learning_rate": 3.1574080734344757e-07,
"loss": 0.4427,
"step": 510
},
{
"epoch": 2.691597008358997,
"grad_norm": 1.3488826792679693,
"learning_rate": 3.0505819692471797e-07,
"loss": 0.4425,
"step": 511
},
{
"epoch": 2.69687637483502,
"grad_norm": 1.2448047137351241,
"learning_rate": 2.9455373749314285e-07,
"loss": 0.5045,
"step": 512
},
{
"epoch": 2.7021557413110426,
"grad_norm": 1.327982717505523,
"learning_rate": 2.842278276436128e-07,
"loss": 0.4434,
"step": 513
},
{
"epoch": 2.7074351077870658,
"grad_norm": 1.3317380787717712,
"learning_rate": 2.7408085919590265e-07,
"loss": 0.4685,
"step": 514
},
{
"epoch": 2.7127144742630884,
"grad_norm": 1.418889669781179,
"learning_rate": 2.6411321717979886e-07,
"loss": 0.4459,
"step": 515
},
{
"epoch": 2.717993840739111,
"grad_norm": 1.4212898616431393,
"learning_rate": 2.5432527982049424e-07,
"loss": 0.4436,
"step": 516
},
{
"epoch": 2.723273207215134,
"grad_norm": 1.372505212467702,
"learning_rate": 2.447174185242324e-07,
"loss": 0.4942,
"step": 517
},
{
"epoch": 2.7285525736911573,
"grad_norm": 1.429171310323447,
"learning_rate": 2.3528999786421758e-07,
"loss": 0.4636,
"step": 518
},
{
"epoch": 2.73383194016718,
"grad_norm": 1.338261834173231,
"learning_rate": 2.2604337556677846e-07,
"loss": 0.4656,
"step": 519
},
{
"epoch": 2.7391113066432027,
"grad_norm": 1.324849203082196,
"learning_rate": 2.1697790249779638e-07,
"loss": 0.4788,
"step": 520
},
{
"epoch": 2.744390673119226,
"grad_norm": 1.350337658446117,
"learning_rate": 2.080939226493889e-07,
"loss": 0.4846,
"step": 521
},
{
"epoch": 2.7496700395952485,
"grad_norm": 1.3500445059182393,
"learning_rate": 1.9939177312685963e-07,
"loss": 0.4419,
"step": 522
},
{
"epoch": 2.7549494060712716,
"grad_norm": 1.375463337098407,
"learning_rate": 1.908717841359048e-07,
"loss": 0.4687,
"step": 523
},
{
"epoch": 2.7602287725472943,
"grad_norm": 1.4823623308521863,
"learning_rate": 1.825342789700846e-07,
"loss": 0.4779,
"step": 524
},
{
"epoch": 2.765508139023317,
"grad_norm": 1.4161590791252803,
"learning_rate": 1.7437957399855488e-07,
"loss": 0.4685,
"step": 525
},
{
"epoch": 2.77078750549934,
"grad_norm": 1.2986729971436586,
"learning_rate": 1.664079786540629e-07,
"loss": 0.4771,
"step": 526
},
{
"epoch": 2.776066871975363,
"grad_norm": 1.3827623556792923,
"learning_rate": 1.5861979542120598e-07,
"loss": 0.4634,
"step": 527
},
{
"epoch": 2.781346238451386,
"grad_norm": 1.3339794328080277,
"learning_rate": 1.510153198249531e-07,
"loss": 0.4435,
"step": 528
},
{
"epoch": 2.7866256049274085,
"grad_norm": 1.3094435959629867,
"learning_rate": 1.435948404194304e-07,
"loss": 0.4537,
"step": 529
},
{
"epoch": 2.7919049714034316,
"grad_norm": 1.395651401211996,
"learning_rate": 1.363586387769761e-07,
"loss": 0.4729,
"step": 530
},
{
"epoch": 2.7971843378794543,
"grad_norm": 1.3803361937212524,
"learning_rate": 1.2930698947744957e-07,
"loss": 0.5551,
"step": 531
},
{
"epoch": 2.8024637043554774,
"grad_norm": 1.403170004866064,
"learning_rate": 1.22440160097817e-07,
"loss": 0.4659,
"step": 532
},
{
"epoch": 2.8077430708315,
"grad_norm": 1.4326502602660136,
"learning_rate": 1.157584112019966e-07,
"loss": 0.4532,
"step": 533
},
{
"epoch": 2.813022437307523,
"grad_norm": 1.413170997771312,
"learning_rate": 1.0926199633097156e-07,
"loss": 0.5077,
"step": 534
},
{
"epoch": 2.818301803783546,
"grad_norm": 1.4978501520243235,
"learning_rate": 1.0295116199317057e-07,
"loss": 0.4524,
"step": 535
},
{
"epoch": 2.823581170259569,
"grad_norm": 1.3811339585897102,
"learning_rate": 9.682614765511134e-08,
"loss": 0.4267,
"step": 536
},
{
"epoch": 2.8288605367355917,
"grad_norm": 1.2532370949093539,
"learning_rate": 9.08871857323157e-08,
"loss": 0.5053,
"step": 537
},
{
"epoch": 2.8341399032116144,
"grad_norm": 1.3788001439277469,
"learning_rate": 8.513450158049109e-08,
"loss": 0.4377,
"step": 538
},
{
"epoch": 2.8394192696876375,
"grad_norm": 1.3386655810754182,
"learning_rate": 7.956831348697791e-08,
"loss": 0.5369,
"step": 539
},
{
"epoch": 2.8446986361636606,
"grad_norm": 1.4031016680793478,
"learning_rate": 7.418883266246734e-08,
"loss": 0.4692,
"step": 540
},
{
"epoch": 2.8499780026396833,
"grad_norm": 1.4223612412457645,
"learning_rate": 6.899626323298714e-08,
"loss": 0.4975,
"step": 541
},
{
"epoch": 2.855257369115706,
"grad_norm": 1.468568132224329,
"learning_rate": 6.399080223215503e-08,
"loss": 0.4278,
"step": 542
},
{
"epoch": 2.860536735591729,
"grad_norm": 1.2905228846748407,
"learning_rate": 5.917263959370312e-08,
"loss": 0.4459,
"step": 543
},
{
"epoch": 2.8658161020677517,
"grad_norm": 1.30438415852383,
"learning_rate": 5.454195814427021e-08,
"loss": 0.4881,
"step": 544
},
{
"epoch": 2.871095468543775,
"grad_norm": 1.4658032992281533,
"learning_rate": 5.009893359646523e-08,
"loss": 0.422,
"step": 545
},
{
"epoch": 2.8763748350197975,
"grad_norm": 1.403126172079016,
"learning_rate": 4.584373454219859e-08,
"loss": 0.4298,
"step": 546
},
{
"epoch": 2.8816542014958206,
"grad_norm": 1.305238740579575,
"learning_rate": 4.177652244628627e-08,
"loss": 0.447,
"step": 547
},
{
"epoch": 2.8869335679718433,
"grad_norm": 1.4226875729692003,
"learning_rate": 3.7897451640321326e-08,
"loss": 0.537,
"step": 548
},
{
"epoch": 2.8922129344478664,
"grad_norm": 1.3043310631991534,
"learning_rate": 3.4206669316819155e-08,
"loss": 0.495,
"step": 549
},
{
"epoch": 2.897492300923889,
"grad_norm": 1.4379823363129705,
"learning_rate": 3.0704315523631956e-08,
"loss": 0.4139,
"step": 550
},
{
"epoch": 2.9027716673999118,
"grad_norm": 1.3544951690739686,
"learning_rate": 2.7390523158633552e-08,
"loss": 0.5213,
"step": 551
},
{
"epoch": 2.908051033875935,
"grad_norm": 1.5078288720536737,
"learning_rate": 2.426541796467785e-08,
"loss": 0.4605,
"step": 552
},
{
"epoch": 2.913330400351958,
"grad_norm": 1.35610421146911,
"learning_rate": 2.1329118524827662e-08,
"loss": 0.4708,
"step": 553
},
{
"epoch": 2.9186097668279807,
"grad_norm": 1.4726737456407306,
"learning_rate": 1.8581736257852756e-08,
"loss": 0.4868,
"step": 554
},
{
"epoch": 2.9238891333040034,
"grad_norm": 1.353334569052309,
"learning_rate": 1.6023375414004894e-08,
"loss": 0.4867,
"step": 555
},
{
"epoch": 2.9291684997800265,
"grad_norm": 1.3416226125226052,
"learning_rate": 1.3654133071059894e-08,
"loss": 0.4396,
"step": 556
},
{
"epoch": 2.934447866256049,
"grad_norm": 1.226026839315963,
"learning_rate": 1.1474099130635575e-08,
"loss": 0.4901,
"step": 557
},
{
"epoch": 2.9397272327320723,
"grad_norm": 1.3404474468548127,
"learning_rate": 9.48335631477948e-09,
"loss": 0.4651,
"step": 558
},
{
"epoch": 2.945006599208095,
"grad_norm": 1.3923456514703572,
"learning_rate": 7.681980162830283e-09,
"loss": 0.5356,
"step": 559
},
{
"epoch": 2.950285965684118,
"grad_norm": 1.2760505962703481,
"learning_rate": 6.070039028550634e-09,
"loss": 0.5385,
"step": 560
},
{
"epoch": 2.9555653321601407,
"grad_norm": 2.2171211350170843,
"learning_rate": 4.647594077534235e-09,
"loss": 0.5793,
"step": 561
},
{
"epoch": 2.960844698636164,
"grad_norm": 1.5115950710680348,
"learning_rate": 3.41469928488547e-09,
"loss": 0.5072,
"step": 562
},
{
"epoch": 2.9661240651121865,
"grad_norm": 1.386901569403929,
"learning_rate": 2.371401433170495e-09,
"loss": 0.4858,
"step": 563
},
{
"epoch": 2.971403431588209,
"grad_norm": 1.4637428485442363,
"learning_rate": 1.5177401106419853e-09,
"loss": 0.4478,
"step": 564
},
{
"epoch": 2.9766827980642323,
"grad_norm": 1.4551106718059847,
"learning_rate": 8.537477097364522e-10,
"loss": 0.4673,
"step": 565
},
{
"epoch": 2.9819621645402554,
"grad_norm": 1.3415126297649902,
"learning_rate": 3.7944942584688947e-10,
"loss": 0.4997,
"step": 566
},
{
"epoch": 2.987241531016278,
"grad_norm": 1.2328749287439438,
"learning_rate": 9.486325636576254e-11,
"loss": 0.5271,
"step": 567
},
{
"epoch": 2.987241531016278,
"step": 567,
"total_flos": 4.6211312738788966e+17,
"train_loss": 0.7107904120832944,
"train_runtime": 62456.134,
"train_samples_per_second": 0.437,
"train_steps_per_second": 0.009
}
],
"logging_steps": 1,
"max_steps": 567,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.6211312738788966e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}