modeltest / trainer_state.json
Wannabtl's picture
Upload folder using huggingface_hub
c819db6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3520179920307038,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00039113110225633753,
"grad_norm": 73.33983612060547,
"learning_rate": 0.0,
"loss": 9022.7637,
"step": 1
},
{
"epoch": 0.0007822622045126751,
"grad_norm": 341.7502746582031,
"learning_rate": 5.555555555555555e-07,
"loss": 13173.7656,
"step": 2
},
{
"epoch": 0.0011733933067690125,
"grad_norm": 87.00965881347656,
"learning_rate": 1.111111111111111e-06,
"loss": 9796.1172,
"step": 3
},
{
"epoch": 0.0015645244090253501,
"grad_norm": 17.399965286254883,
"learning_rate": 1.6666666666666667e-06,
"loss": 11499.0176,
"step": 4
},
{
"epoch": 0.0019556555112816877,
"grad_norm": 13.876103401184082,
"learning_rate": 2.222222222222222e-06,
"loss": 8862.1523,
"step": 5
},
{
"epoch": 0.002346786613538025,
"grad_norm": 79.78038787841797,
"learning_rate": 2.7777777777777783e-06,
"loss": 9754.3584,
"step": 6
},
{
"epoch": 0.002737917715794363,
"grad_norm": 32.10177230834961,
"learning_rate": 3.3333333333333333e-06,
"loss": 6731.0986,
"step": 7
},
{
"epoch": 0.0031290488180507003,
"grad_norm": 35.9862174987793,
"learning_rate": 3.88888888888889e-06,
"loss": 11245.0312,
"step": 8
},
{
"epoch": 0.003520179920307038,
"grad_norm": 28.863813400268555,
"learning_rate": 4.444444444444444e-06,
"loss": 6245.1196,
"step": 9
},
{
"epoch": 0.003911311022563375,
"grad_norm": 42.70909881591797,
"learning_rate": 5e-06,
"loss": 6062.3369,
"step": 10
},
{
"epoch": 0.004302442124819713,
"grad_norm": 8.477494239807129,
"learning_rate": 5.555555555555557e-06,
"loss": 9045.1113,
"step": 11
},
{
"epoch": 0.00469357322707605,
"grad_norm": 17.350603103637695,
"learning_rate": 6.111111111111112e-06,
"loss": 4973.7451,
"step": 12
},
{
"epoch": 0.005084704329332388,
"grad_norm": 10729.576171875,
"learning_rate": 6.666666666666667e-06,
"loss": 2556.2378,
"step": 13
},
{
"epoch": 0.005475835431588726,
"grad_norm": 12796.4111328125,
"learning_rate": 7.222222222222223e-06,
"loss": 2469.0479,
"step": 14
},
{
"epoch": 0.005866966533845063,
"grad_norm": 11690.9521484375,
"learning_rate": 7.77777777777778e-06,
"loss": 1900.1685,
"step": 15
},
{
"epoch": 0.0062580976361014005,
"grad_norm": 10036.77734375,
"learning_rate": 8.333333333333334e-06,
"loss": 2600.8911,
"step": 16
},
{
"epoch": 0.006649228738357738,
"grad_norm": 10.607966423034668,
"learning_rate": 8.888888888888888e-06,
"loss": 871.7581,
"step": 17
},
{
"epoch": 0.007040359840614076,
"grad_norm": 19968.548828125,
"learning_rate": 9.444444444444445e-06,
"loss": 472.9429,
"step": 18
},
{
"epoch": 0.0074314909428704135,
"grad_norm": 15.735692024230957,
"learning_rate": 1e-05,
"loss": 2482.8877,
"step": 19
},
{
"epoch": 0.00782262204512675,
"grad_norm": 6.307197570800781,
"learning_rate": 9.999968282268043e-06,
"loss": 538.6899,
"step": 20
},
{
"epoch": 0.008213753147383089,
"grad_norm": 7844.2470703125,
"learning_rate": 9.999873129474573e-06,
"loss": 1317.9186,
"step": 21
},
{
"epoch": 0.008604884249639426,
"grad_norm": 1.590285062789917,
"learning_rate": 9.999714542826806e-06,
"loss": 641.1613,
"step": 22
},
{
"epoch": 0.008996015351895764,
"grad_norm": 7149.05322265625,
"learning_rate": 9.999492524336743e-06,
"loss": 821.1104,
"step": 23
},
{
"epoch": 0.0093871464541521,
"grad_norm": 3.6406304836273193,
"learning_rate": 9.999207076821155e-06,
"loss": 1000.1948,
"step": 24
},
{
"epoch": 0.009778277556408439,
"grad_norm": 3.2047431468963623,
"learning_rate": 9.99885820390154e-06,
"loss": 418.2515,
"step": 25
},
{
"epoch": 0.010169408658664777,
"grad_norm": 2.1247236728668213,
"learning_rate": 9.998445910004082e-06,
"loss": 206.9073,
"step": 26
},
{
"epoch": 0.010560539760921113,
"grad_norm": 6176.64404296875,
"learning_rate": 9.997970200359592e-06,
"loss": 277.8287,
"step": 27
},
{
"epoch": 0.010951670863177452,
"grad_norm": 0.7870343327522278,
"learning_rate": 9.99743108100344e-06,
"loss": 28.0133,
"step": 28
},
{
"epoch": 0.011342801965433788,
"grad_norm": 2.914349317550659,
"learning_rate": 9.996828558775486e-06,
"loss": 528.6738,
"step": 29
},
{
"epoch": 0.011733933067690126,
"grad_norm": 5678.888671875,
"learning_rate": 9.996162641319985e-06,
"loss": 173.2077,
"step": 30
},
{
"epoch": 0.012125064169946465,
"grad_norm": 1.8666399717330933,
"learning_rate": 9.995433337085492e-06,
"loss": 227.239,
"step": 31
},
{
"epoch": 0.012516195272202801,
"grad_norm": 2.4892313480377197,
"learning_rate": 9.994640655324758e-06,
"loss": 32.1116,
"step": 32
},
{
"epoch": 0.01290732637445914,
"grad_norm": 1591.641357421875,
"learning_rate": 9.993784606094612e-06,
"loss": 211.6248,
"step": 33
},
{
"epoch": 0.013298457476715476,
"grad_norm": 10072.65234375,
"learning_rate": 9.992865200255829e-06,
"loss": 374.1296,
"step": 34
},
{
"epoch": 0.013689588578971814,
"grad_norm": 4825.43408203125,
"learning_rate": 9.991882449472994e-06,
"loss": 444.3439,
"step": 35
},
{
"epoch": 0.014080719681228152,
"grad_norm": 30619.453125,
"learning_rate": 9.99083636621436e-06,
"loss": 1014.899,
"step": 36
},
{
"epoch": 0.014471850783484489,
"grad_norm": 0.7788718342781067,
"learning_rate": 9.989726963751683e-06,
"loss": 637.2323,
"step": 37
},
{
"epoch": 0.014862981885740827,
"grad_norm": 0.37763819098472595,
"learning_rate": 9.988554256160052e-06,
"loss": 338.0694,
"step": 38
},
{
"epoch": 0.015254112987997163,
"grad_norm": 5639.20166015625,
"learning_rate": 9.987318258317718e-06,
"loss": 280.443,
"step": 39
},
{
"epoch": 0.0156452440902535,
"grad_norm": 0.34363701939582825,
"learning_rate": 9.986018985905901e-06,
"loss": 127.4894,
"step": 40
},
{
"epoch": 0.01603637519250984,
"grad_norm": 1.3454755544662476,
"learning_rate": 9.984656455408591e-06,
"loss": 227.2196,
"step": 41
},
{
"epoch": 0.016427506294766178,
"grad_norm": 1.1212941408157349,
"learning_rate": 9.983230684112338e-06,
"loss": 388.5173,
"step": 42
},
{
"epoch": 0.016818637397022513,
"grad_norm": 4135.07958984375,
"learning_rate": 9.981741690106035e-06,
"loss": 277.9473,
"step": 43
},
{
"epoch": 0.01720976849927885,
"grad_norm": 0.3415931761264801,
"learning_rate": 9.980189492280688e-06,
"loss": 141.4491,
"step": 44
},
{
"epoch": 0.01760089960153519,
"grad_norm": 0.3497358560562134,
"learning_rate": 9.978574110329174e-06,
"loss": 125.8424,
"step": 45
},
{
"epoch": 0.017992030703791528,
"grad_norm": 5.21222448348999,
"learning_rate": 9.976895564745993e-06,
"loss": 693.0908,
"step": 46
},
{
"epoch": 0.018383161806047866,
"grad_norm": 6.805148124694824,
"learning_rate": 9.975153876827008e-06,
"loss": 237.5693,
"step": 47
},
{
"epoch": 0.0187742929083042,
"grad_norm": 0.32790857553482056,
"learning_rate": 9.973349068669178e-06,
"loss": 231.3147,
"step": 48
},
{
"epoch": 0.01916542401056054,
"grad_norm": 1.088236927986145,
"learning_rate": 9.97148116317027e-06,
"loss": 59.4532,
"step": 49
},
{
"epoch": 0.019556555112816877,
"grad_norm": 3.897584915161133,
"learning_rate": 9.969550184028572e-06,
"loss": 181.5394,
"step": 50
},
{
"epoch": 0.019947686215073215,
"grad_norm": 0.36891791224479675,
"learning_rate": 9.9675561557426e-06,
"loss": 274.0787,
"step": 51
},
{
"epoch": 0.020338817317329554,
"grad_norm": 4463.36474609375,
"learning_rate": 9.965499103610775e-06,
"loss": 334.0236,
"step": 52
},
{
"epoch": 0.02072994841958589,
"grad_norm": 1.5177428722381592,
"learning_rate": 9.963379053731104e-06,
"loss": 116.3086,
"step": 53
},
{
"epoch": 0.021121079521842227,
"grad_norm": 0.948329746723175,
"learning_rate": 9.961196033000862e-06,
"loss": 167.3062,
"step": 54
},
{
"epoch": 0.021512210624098565,
"grad_norm": 6488.12744140625,
"learning_rate": 9.95895006911623e-06,
"loss": 280.1733,
"step": 55
},
{
"epoch": 0.021903341726354903,
"grad_norm": 1.1784950494766235,
"learning_rate": 9.956641190571967e-06,
"loss": 211.0869,
"step": 56
},
{
"epoch": 0.02229447282861124,
"grad_norm": 1.0954192876815796,
"learning_rate": 9.954269426661023e-06,
"loss": 108.0197,
"step": 57
},
{
"epoch": 0.022685603930867576,
"grad_norm": 2431.99755859375,
"learning_rate": 9.951834807474191e-06,
"loss": 182.8621,
"step": 58
},
{
"epoch": 0.023076735033123914,
"grad_norm": 0.5318559408187866,
"learning_rate": 9.949337363899709e-06,
"loss": 203.5584,
"step": 59
},
{
"epoch": 0.023467866135380253,
"grad_norm": 611.1079711914062,
"learning_rate": 9.946777127622874e-06,
"loss": 304.5495,
"step": 60
},
{
"epoch": 0.02385899723763659,
"grad_norm": 1.1814780235290527,
"learning_rate": 9.944154131125643e-06,
"loss": 305.7924,
"step": 61
},
{
"epoch": 0.02425012833989293,
"grad_norm": 0.3324390947818756,
"learning_rate": 9.941468407686216e-06,
"loss": 133.2165,
"step": 62
},
{
"epoch": 0.024641259442149264,
"grad_norm": 0.3267706036567688,
"learning_rate": 9.938719991378614e-06,
"loss": 234.4715,
"step": 63
},
{
"epoch": 0.025032390544405602,
"grad_norm": 1.3336584568023682,
"learning_rate": 9.935908917072253e-06,
"loss": 44.1555,
"step": 64
},
{
"epoch": 0.02542352164666194,
"grad_norm": 0.33313995599746704,
"learning_rate": 9.933035220431489e-06,
"loss": 199.4248,
"step": 65
},
{
"epoch": 0.02581465274891828,
"grad_norm": 3202.915771484375,
"learning_rate": 9.930098937915177e-06,
"loss": 142.3462,
"step": 66
},
{
"epoch": 0.026205783851174617,
"grad_norm": 1.4841117858886719,
"learning_rate": 9.927100106776213e-06,
"loss": 334.0991,
"step": 67
},
{
"epoch": 0.02659691495343095,
"grad_norm": 3751.06689453125,
"learning_rate": 9.924038765061042e-06,
"loss": 254.8563,
"step": 68
},
{
"epoch": 0.02698804605568729,
"grad_norm": 0.25357913970947266,
"learning_rate": 9.920914951609189e-06,
"loss": 173.2204,
"step": 69
},
{
"epoch": 0.027379177157943628,
"grad_norm": 619.5060424804688,
"learning_rate": 9.917728706052765e-06,
"loss": 158.5274,
"step": 70
},
{
"epoch": 0.027770308260199966,
"grad_norm": 0.3924911320209503,
"learning_rate": 9.914480068815964e-06,
"loss": 101.5475,
"step": 71
},
{
"epoch": 0.028161439362456304,
"grad_norm": 477.5389404296875,
"learning_rate": 9.91116908111455e-06,
"loss": 150.1975,
"step": 72
},
{
"epoch": 0.02855257046471264,
"grad_norm": 0.28718748688697815,
"learning_rate": 9.907795784955327e-06,
"loss": 209.6969,
"step": 73
},
{
"epoch": 0.028943701566968977,
"grad_norm": 0.29111266136169434,
"learning_rate": 9.90436022313562e-06,
"loss": 153.0687,
"step": 74
},
{
"epoch": 0.029334832669225316,
"grad_norm": 323.9737243652344,
"learning_rate": 9.900862439242719e-06,
"loss": 74.2796,
"step": 75
},
{
"epoch": 0.029725963771481654,
"grad_norm": 2.6136021614074707,
"learning_rate": 9.897302477653334e-06,
"loss": 291.2503,
"step": 76
},
{
"epoch": 0.030117094873737992,
"grad_norm": 0.22756816446781158,
"learning_rate": 9.893680383533027e-06,
"loss": 96.8319,
"step": 77
},
{
"epoch": 0.030508225975994327,
"grad_norm": 0.9840016961097717,
"learning_rate": 9.889996202835642e-06,
"loss": 309.5101,
"step": 78
},
{
"epoch": 0.030899357078250665,
"grad_norm": 0.275417685508728,
"learning_rate": 9.88624998230272e-06,
"loss": 234.1608,
"step": 79
},
{
"epoch": 0.031290488180507,
"grad_norm": 0.7580969929695129,
"learning_rate": 9.882441769462911e-06,
"loss": 237.8195,
"step": 80
},
{
"epoch": 0.03168161928276334,
"grad_norm": 0.76526939868927,
"learning_rate": 9.878571612631364e-06,
"loss": 84.615,
"step": 81
},
{
"epoch": 0.03207275038501968,
"grad_norm": 373.3411560058594,
"learning_rate": 9.874639560909118e-06,
"loss": 300.4078,
"step": 82
},
{
"epoch": 0.03246388148727602,
"grad_norm": 1.3133848905563354,
"learning_rate": 9.870645664182478e-06,
"loss": 131.5505,
"step": 83
},
{
"epoch": 0.032855012589532356,
"grad_norm": 311.3302917480469,
"learning_rate": 9.86658997312238e-06,
"loss": 196.0266,
"step": 84
},
{
"epoch": 0.033246143691788695,
"grad_norm": 0.6496802568435669,
"learning_rate": 9.862472539183757e-06,
"loss": 73.9141,
"step": 85
},
{
"epoch": 0.033637274794045026,
"grad_norm": 459.7312927246094,
"learning_rate": 9.858293414604871e-06,
"loss": 152.4336,
"step": 86
},
{
"epoch": 0.034028405896301364,
"grad_norm": 335.05426025390625,
"learning_rate": 9.854052652406666e-06,
"loss": 239.4406,
"step": 87
},
{
"epoch": 0.0344195369985577,
"grad_norm": 1.7906296253204346,
"learning_rate": 9.849750306392085e-06,
"loss": 129.0228,
"step": 88
},
{
"epoch": 0.03481066810081404,
"grad_norm": 0.2897918224334717,
"learning_rate": 9.84538643114539e-06,
"loss": 150.6289,
"step": 89
},
{
"epoch": 0.03520179920307038,
"grad_norm": 486.94952392578125,
"learning_rate": 9.840961082031473e-06,
"loss": 98.7655,
"step": 90
},
{
"epoch": 0.03559293030532672,
"grad_norm": 1.4039978981018066,
"learning_rate": 9.836474315195148e-06,
"loss": 153.2002,
"step": 91
},
{
"epoch": 0.035984061407583055,
"grad_norm": 2591.096435546875,
"learning_rate": 9.831926187560441e-06,
"loss": 345.5845,
"step": 92
},
{
"epoch": 0.036375192509839394,
"grad_norm": 2290.58740234375,
"learning_rate": 9.827316756829871e-06,
"loss": 95.79,
"step": 93
},
{
"epoch": 0.03676632361209573,
"grad_norm": 2.362971067428589,
"learning_rate": 9.822646081483713e-06,
"loss": 177.6802,
"step": 94
},
{
"epoch": 0.03715745471435207,
"grad_norm": 0.3684510886669159,
"learning_rate": 9.817914220779258e-06,
"loss": 364.415,
"step": 95
},
{
"epoch": 0.0375485858166084,
"grad_norm": 3289.91162109375,
"learning_rate": 9.81312123475006e-06,
"loss": 353.144,
"step": 96
},
{
"epoch": 0.03793971691886474,
"grad_norm": 0.3244830369949341,
"learning_rate": 9.808267184205182e-06,
"loss": 189.2679,
"step": 97
},
{
"epoch": 0.03833084802112108,
"grad_norm": 1.0287890434265137,
"learning_rate": 9.80335213072841e-06,
"loss": 139.2987,
"step": 98
},
{
"epoch": 0.038721979123377416,
"grad_norm": 1508.3958740234375,
"learning_rate": 9.798376136677486e-06,
"loss": 187.994,
"step": 99
},
{
"epoch": 0.039113110225633754,
"grad_norm": 2.16009521484375,
"learning_rate": 9.793339265183303e-06,
"loss": 142.5293,
"step": 100
},
{
"epoch": 0.03950424132789009,
"grad_norm": 0.4904083013534546,
"learning_rate": 9.788241580149123e-06,
"loss": 147.5625,
"step": 101
},
{
"epoch": 0.03989537243014643,
"grad_norm": 1.0540492534637451,
"learning_rate": 9.783083146249749e-06,
"loss": 195.4026,
"step": 102
},
{
"epoch": 0.04028650353240277,
"grad_norm": 602.712158203125,
"learning_rate": 9.777864028930705e-06,
"loss": 101.4185,
"step": 103
},
{
"epoch": 0.04067763463465911,
"grad_norm": 2.976310968399048,
"learning_rate": 9.77258429440742e-06,
"loss": 218.7537,
"step": 104
},
{
"epoch": 0.041068765736915445,
"grad_norm": 2.811608076095581,
"learning_rate": 9.767244009664376e-06,
"loss": 96.5371,
"step": 105
},
{
"epoch": 0.04145989683917178,
"grad_norm": 0.34032607078552246,
"learning_rate": 9.761843242454261e-06,
"loss": 191.7003,
"step": 106
},
{
"epoch": 0.041851027941428115,
"grad_norm": 5016.841796875,
"learning_rate": 9.75638206129711e-06,
"loss": 112.3467,
"step": 107
},
{
"epoch": 0.04224215904368445,
"grad_norm": 0.2334691882133484,
"learning_rate": 9.750860535479434e-06,
"loss": 153.987,
"step": 108
},
{
"epoch": 0.04263329014594079,
"grad_norm": 0.16604840755462646,
"learning_rate": 9.745278735053345e-06,
"loss": 164.6346,
"step": 109
},
{
"epoch": 0.04302442124819713,
"grad_norm": 2.011397361755371,
"learning_rate": 9.73963673083566e-06,
"loss": 96.3216,
"step": 110
},
{
"epoch": 0.04341555235045347,
"grad_norm": 216.0008544921875,
"learning_rate": 9.733934594407012e-06,
"loss": 97.9669,
"step": 111
},
{
"epoch": 0.043806683452709806,
"grad_norm": 781.0093383789062,
"learning_rate": 9.728172398110935e-06,
"loss": 124.8754,
"step": 112
},
{
"epoch": 0.044197814554966144,
"grad_norm": 1206.0252685546875,
"learning_rate": 9.722350215052946e-06,
"loss": 126.4322,
"step": 113
},
{
"epoch": 0.04458894565722248,
"grad_norm": 1861.3472900390625,
"learning_rate": 9.716468119099626e-06,
"loss": 111.0165,
"step": 114
},
{
"epoch": 0.04498007675947882,
"grad_norm": 0.7697561979293823,
"learning_rate": 9.710526184877667e-06,
"loss": 1.3695,
"step": 115
},
{
"epoch": 0.04537120786173515,
"grad_norm": 0.5405284762382507,
"learning_rate": 9.704524487772944e-06,
"loss": 126.1109,
"step": 116
},
{
"epoch": 0.04576233896399149,
"grad_norm": 0.6306818723678589,
"learning_rate": 9.698463103929542e-06,
"loss": 21.7656,
"step": 117
},
{
"epoch": 0.04615347006624783,
"grad_norm": 3005.80712890625,
"learning_rate": 9.692342110248802e-06,
"loss": 127.39,
"step": 118
},
{
"epoch": 0.04654460116850417,
"grad_norm": 392.67950439453125,
"learning_rate": 9.68616158438834e-06,
"loss": 110.3292,
"step": 119
},
{
"epoch": 0.046935732270760505,
"grad_norm": 3.171126365661621,
"learning_rate": 9.679921604761056e-06,
"loss": 132.2241,
"step": 120
},
{
"epoch": 0.04732686337301684,
"grad_norm": 0.4518907368183136,
"learning_rate": 9.673622250534155e-06,
"loss": 136.119,
"step": 121
},
{
"epoch": 0.04771799447527318,
"grad_norm": 2589.1015625,
"learning_rate": 9.66726360162813e-06,
"loss": 381.2232,
"step": 122
},
{
"epoch": 0.04810912557752952,
"grad_norm": 0.1722714751958847,
"learning_rate": 9.660845738715743e-06,
"loss": 97.7159,
"step": 123
},
{
"epoch": 0.04850025667978586,
"grad_norm": 0.5003955364227295,
"learning_rate": 9.654368743221022e-06,
"loss": 50.7664,
"step": 124
},
{
"epoch": 0.048891387782042196,
"grad_norm": 0.28121402859687805,
"learning_rate": 9.647832697318207e-06,
"loss": 93.6975,
"step": 125
},
{
"epoch": 0.04928251888429853,
"grad_norm": 2588.393310546875,
"learning_rate": 9.641237683930722e-06,
"loss": 54.1281,
"step": 126
},
{
"epoch": 0.049673649986554866,
"grad_norm": 522.7401123046875,
"learning_rate": 9.63458378673011e-06,
"loss": 21.376,
"step": 127
},
{
"epoch": 0.050064781088811204,
"grad_norm": 1337.2923583984375,
"learning_rate": 9.627871090134984e-06,
"loss": 97.5455,
"step": 128
},
{
"epoch": 0.05045591219106754,
"grad_norm": 0.17062804102897644,
"learning_rate": 9.621099679309948e-06,
"loss": 66.8805,
"step": 129
},
{
"epoch": 0.05084704329332388,
"grad_norm": 1.8228886127471924,
"learning_rate": 9.61426964016452e-06,
"loss": 90.8759,
"step": 130
},
{
"epoch": 0.05123817439558022,
"grad_norm": 0.9226913452148438,
"learning_rate": 9.60738105935204e-06,
"loss": 57.6078,
"step": 131
},
{
"epoch": 0.05162930549783656,
"grad_norm": 0.695069432258606,
"learning_rate": 9.60043402426857e-06,
"loss": 125.0028,
"step": 132
},
{
"epoch": 0.052020436600092895,
"grad_norm": 1.5678439140319824,
"learning_rate": 9.593428623051793e-06,
"loss": 94.1563,
"step": 133
},
{
"epoch": 0.052411567702349234,
"grad_norm": 1.5755531787872314,
"learning_rate": 9.58636494457988e-06,
"loss": 83.3838,
"step": 134
},
{
"epoch": 0.05280269880460557,
"grad_norm": 0.2734808623790741,
"learning_rate": 9.57924307847038e-06,
"loss": 31.6734,
"step": 135
},
{
"epoch": 0.0531938299068619,
"grad_norm": 1.4930920600891113,
"learning_rate": 9.572063115079063e-06,
"loss": 126.7836,
"step": 136
},
{
"epoch": 0.05358496100911824,
"grad_norm": 0.25422215461730957,
"learning_rate": 9.564825145498795e-06,
"loss": 76.5774,
"step": 137
},
{
"epoch": 0.05397609211137458,
"grad_norm": 163.52911376953125,
"learning_rate": 9.557529261558367e-06,
"loss": 53.7238,
"step": 138
},
{
"epoch": 0.05436722321363092,
"grad_norm": 553.9047241210938,
"learning_rate": 9.550175555821333e-06,
"loss": 76.2747,
"step": 139
},
{
"epoch": 0.054758354315887256,
"grad_norm": 0.18982116878032684,
"learning_rate": 9.542764121584845e-06,
"loss": 289.7023,
"step": 140
},
{
"epoch": 0.055149485418143594,
"grad_norm": 1.4099926948547363,
"learning_rate": 9.53529505287845e-06,
"loss": 43.1031,
"step": 141
},
{
"epoch": 0.05554061652039993,
"grad_norm": 1.5430524349212646,
"learning_rate": 9.527768444462922e-06,
"loss": 4.821,
"step": 142
},
{
"epoch": 0.05593174762265627,
"grad_norm": 0.40151554346084595,
"learning_rate": 9.520184391829037e-06,
"loss": 67.8287,
"step": 143
},
{
"epoch": 0.05632287872491261,
"grad_norm": 0.22733083367347717,
"learning_rate": 9.512542991196377e-06,
"loss": 162.4188,
"step": 144
},
{
"epoch": 0.05671400982716895,
"grad_norm": 0.44649437069892883,
"learning_rate": 9.504844339512096e-06,
"loss": 80.723,
"step": 145
},
{
"epoch": 0.05710514092942528,
"grad_norm": 4683.439453125,
"learning_rate": 9.497088534449707e-06,
"loss": 362.2365,
"step": 146
},
{
"epoch": 0.05749627203168162,
"grad_norm": 1.031704306602478,
"learning_rate": 9.489275674407826e-06,
"loss": 164.8387,
"step": 147
},
{
"epoch": 0.057887403133937955,
"grad_norm": 0.1672774702310562,
"learning_rate": 9.481405858508935e-06,
"loss": 117.6841,
"step": 148
},
{
"epoch": 0.05827853423619429,
"grad_norm": 292.0991516113281,
"learning_rate": 9.473479186598115e-06,
"loss": 86.3267,
"step": 149
},
{
"epoch": 0.05866966533845063,
"grad_norm": 2.811321496963501,
"learning_rate": 9.465495759241793e-06,
"loss": 291.6378,
"step": 150
},
{
"epoch": 0.05906079644070697,
"grad_norm": 3673.74951171875,
"learning_rate": 9.457455677726447e-06,
"loss": 118.6411,
"step": 151
},
{
"epoch": 0.05945192754296331,
"grad_norm": 3.3059866428375244,
"learning_rate": 9.449359044057344e-06,
"loss": 152.2512,
"step": 152
},
{
"epoch": 0.059843058645219646,
"grad_norm": 1.5615016222000122,
"learning_rate": 9.441205960957221e-06,
"loss": 75.4238,
"step": 153
},
{
"epoch": 0.060234189747475984,
"grad_norm": 129.04693603515625,
"learning_rate": 9.432996531865001e-06,
"loss": 98.264,
"step": 154
},
{
"epoch": 0.06062532084973232,
"grad_norm": 1.3238544464111328,
"learning_rate": 9.424730860934474e-06,
"loss": 138.5685,
"step": 155
},
{
"epoch": 0.061016451951988654,
"grad_norm": 0.6455661058425903,
"learning_rate": 9.416409053032971e-06,
"loss": 76.155,
"step": 156
},
{
"epoch": 0.06140758305424499,
"grad_norm": 3.3306515216827393,
"learning_rate": 9.408031213740045e-06,
"loss": 193.0163,
"step": 157
},
{
"epoch": 0.06179871415650133,
"grad_norm": 363.2554626464844,
"learning_rate": 9.399597449346119e-06,
"loss": 71.2268,
"step": 158
},
{
"epoch": 0.06218984525875767,
"grad_norm": 1775.3311767578125,
"learning_rate": 9.391107866851143e-06,
"loss": 102.3627,
"step": 159
},
{
"epoch": 0.062580976361014,
"grad_norm": 0.23126842081546783,
"learning_rate": 9.382562573963238e-06,
"loss": 33.1838,
"step": 160
},
{
"epoch": 0.06297210746327034,
"grad_norm": 425.8428955078125,
"learning_rate": 9.37396167909733e-06,
"loss": 122.2196,
"step": 161
},
{
"epoch": 0.06336323856552668,
"grad_norm": 2.3558237552642822,
"learning_rate": 9.365305291373769e-06,
"loss": 91.5736,
"step": 162
},
{
"epoch": 0.06375436966778301,
"grad_norm": 0.19831174612045288,
"learning_rate": 9.356593520616948e-06,
"loss": 126.7139,
"step": 163
},
{
"epoch": 0.06414550077003936,
"grad_norm": 2583.596435546875,
"learning_rate": 9.347826477353911e-06,
"loss": 114.9668,
"step": 164
},
{
"epoch": 0.06453663187229569,
"grad_norm": 468.4013977050781,
"learning_rate": 9.33900427281295e-06,
"loss": 58.1353,
"step": 165
},
{
"epoch": 0.06492776297455204,
"grad_norm": 0.607861340045929,
"learning_rate": 9.330127018922195e-06,
"loss": 89.4309,
"step": 166
},
{
"epoch": 0.06531889407680837,
"grad_norm": 1.087489128112793,
"learning_rate": 9.321194828308185e-06,
"loss": 57.7468,
"step": 167
},
{
"epoch": 0.06571002517906471,
"grad_norm": 1.4299119710922241,
"learning_rate": 9.312207814294454e-06,
"loss": 131.9059,
"step": 168
},
{
"epoch": 0.06610115628132104,
"grad_norm": 0.32067033648490906,
"learning_rate": 9.303166090900082e-06,
"loss": 113.9032,
"step": 169
},
{
"epoch": 0.06649228738357739,
"grad_norm": 2900.03857421875,
"learning_rate": 9.294069772838253e-06,
"loss": 62.0634,
"step": 170
},
{
"epoch": 0.06688341848583372,
"grad_norm": 2043.025634765625,
"learning_rate": 9.284918975514798e-06,
"loss": 128.1048,
"step": 171
},
{
"epoch": 0.06727454958809005,
"grad_norm": 763.0787963867188,
"learning_rate": 9.275713815026732e-06,
"loss": 100.8595,
"step": 172
},
{
"epoch": 0.0676656806903464,
"grad_norm": 316.2015380859375,
"learning_rate": 9.266454408160779e-06,
"loss": 83.9507,
"step": 173
},
{
"epoch": 0.06805681179260273,
"grad_norm": 1.8424168825149536,
"learning_rate": 9.257140872391895e-06,
"loss": 150.7857,
"step": 174
},
{
"epoch": 0.06844794289485907,
"grad_norm": 3.162083387374878,
"learning_rate": 9.24777332588177e-06,
"loss": 109.7346,
"step": 175
},
{
"epoch": 0.0688390739971154,
"grad_norm": 0.48428380489349365,
"learning_rate": 9.238351887477338e-06,
"loss": 198.0632,
"step": 176
},
{
"epoch": 0.06923020509937175,
"grad_norm": 283.169921875,
"learning_rate": 9.22887667670926e-06,
"loss": 133.4527,
"step": 177
},
{
"epoch": 0.06962133620162808,
"grad_norm": 0.24236121773719788,
"learning_rate": 9.219347813790416e-06,
"loss": 134.3827,
"step": 178
},
{
"epoch": 0.07001246730388443,
"grad_norm": 1.1005642414093018,
"learning_rate": 9.209765419614375e-06,
"loss": 95.9749,
"step": 179
},
{
"epoch": 0.07040359840614076,
"grad_norm": 0.2186761498451233,
"learning_rate": 9.200129615753858e-06,
"loss": 99.3441,
"step": 180
},
{
"epoch": 0.07079472950839709,
"grad_norm": 1.1894441843032837,
"learning_rate": 9.190440524459203e-06,
"loss": 64.4476,
"step": 181
},
{
"epoch": 0.07118586061065343,
"grad_norm": 1906.7816162109375,
"learning_rate": 9.180698268656814e-06,
"loss": 76.6484,
"step": 182
},
{
"epoch": 0.07157699171290977,
"grad_norm": 1806.0740966796875,
"learning_rate": 9.170902971947589e-06,
"loss": 124.0878,
"step": 183
},
{
"epoch": 0.07196812281516611,
"grad_norm": 0.9422973394393921,
"learning_rate": 9.16105475860537e-06,
"loss": 61.4217,
"step": 184
},
{
"epoch": 0.07235925391742244,
"grad_norm": 0.22520415484905243,
"learning_rate": 9.151153753575351e-06,
"loss": 76.0462,
"step": 185
},
{
"epoch": 0.07275038501967879,
"grad_norm": 0.5669053792953491,
"learning_rate": 9.141200082472503e-06,
"loss": 66.0641,
"step": 186
},
{
"epoch": 0.07314151612193512,
"grad_norm": 4875.95751953125,
"learning_rate": 9.131193871579975e-06,
"loss": 363.5605,
"step": 187
},
{
"epoch": 0.07353264722419146,
"grad_norm": 1.5607740879058838,
"learning_rate": 9.121135247847492e-06,
"loss": 163.8841,
"step": 188
},
{
"epoch": 0.0739237783264478,
"grad_norm": 0.15883424878120422,
"learning_rate": 9.111024338889748e-06,
"loss": 99.735,
"step": 189
},
{
"epoch": 0.07431490942870414,
"grad_norm": 1.4388748407363892,
"learning_rate": 9.10086127298478e-06,
"loss": 37.5176,
"step": 190
},
{
"epoch": 0.07470604053096047,
"grad_norm": 5.2772040367126465,
"learning_rate": 9.090646179072352e-06,
"loss": 107.7335,
"step": 191
},
{
"epoch": 0.0750971716332168,
"grad_norm": 1.8319369554519653,
"learning_rate": 9.080379186752304e-06,
"loss": 71.8792,
"step": 192
},
{
"epoch": 0.07548830273547315,
"grad_norm": 0.7392552495002747,
"learning_rate": 9.070060426282924e-06,
"loss": 113.0879,
"step": 193
},
{
"epoch": 0.07587943383772948,
"grad_norm": 1.0116955041885376,
"learning_rate": 9.059690028579285e-06,
"loss": 37.4901,
"step": 194
},
{
"epoch": 0.07627056493998582,
"grad_norm": 0.36633720993995667,
"learning_rate": 9.049268125211577e-06,
"loss": 33.619,
"step": 195
},
{
"epoch": 0.07666169604224216,
"grad_norm": 0.25814440846443176,
"learning_rate": 9.038794848403463e-06,
"loss": 73.8944,
"step": 196
},
{
"epoch": 0.0770528271444985,
"grad_norm": 0.9375834465026855,
"learning_rate": 9.028270331030373e-06,
"loss": 130.3545,
"step": 197
},
{
"epoch": 0.07744395824675483,
"grad_norm": 0.18285518884658813,
"learning_rate": 9.017694706617836e-06,
"loss": 52.1208,
"step": 198
},
{
"epoch": 0.07783508934901118,
"grad_norm": 3008.265869140625,
"learning_rate": 9.007068109339783e-06,
"loss": 67.7978,
"step": 199
},
{
"epoch": 0.07822622045126751,
"grad_norm": 0.1591944545507431,
"learning_rate": 8.996390674016839e-06,
"loss": 56.9001,
"step": 200
},
{
"epoch": 0.07861735155352384,
"grad_norm": 0.13576161861419678,
"learning_rate": 8.985662536114614e-06,
"loss": 136.3152,
"step": 201
},
{
"epoch": 0.07900848265578019,
"grad_norm": 411.30450439453125,
"learning_rate": 8.97488383174199e-06,
"loss": 125.2447,
"step": 202
},
{
"epoch": 0.07939961375803652,
"grad_norm": 0.5793523788452148,
"learning_rate": 8.964054697649389e-06,
"loss": 179.7917,
"step": 203
},
{
"epoch": 0.07979074486029286,
"grad_norm": 1.1756185293197632,
"learning_rate": 8.953175271227042e-06,
"loss": 208.6852,
"step": 204
},
{
"epoch": 0.08018187596254919,
"grad_norm": 0.4390352666378021,
"learning_rate": 8.94224569050324e-06,
"loss": 78.6966,
"step": 205
},
{
"epoch": 0.08057300706480554,
"grad_norm": 0.20057456195354462,
"learning_rate": 8.931266094142588e-06,
"loss": 79.7404,
"step": 206
},
{
"epoch": 0.08096413816706187,
"grad_norm": 2346.697509765625,
"learning_rate": 8.920236621444243e-06,
"loss": 162.8069,
"step": 207
},
{
"epoch": 0.08135526926931821,
"grad_norm": 0.19323071837425232,
"learning_rate": 8.90915741234015e-06,
"loss": 182.8168,
"step": 208
},
{
"epoch": 0.08174640037157455,
"grad_norm": 301.8100891113281,
"learning_rate": 8.89802860739326e-06,
"loss": 75.6532,
"step": 209
},
{
"epoch": 0.08213753147383089,
"grad_norm": 1862.041748046875,
"learning_rate": 8.88685034779576e-06,
"loss": 147.3607,
"step": 210
},
{
"epoch": 0.08252866257608722,
"grad_norm": 0.15555168688297272,
"learning_rate": 8.87562277536726e-06,
"loss": 8.7277,
"step": 211
},
{
"epoch": 0.08291979367834355,
"grad_norm": 0.22444282472133636,
"learning_rate": 8.864346032553016e-06,
"loss": 168.385,
"step": 212
},
{
"epoch": 0.0833109247805999,
"grad_norm": 1.4300802946090698,
"learning_rate": 8.853020262422111e-06,
"loss": 170.4068,
"step": 213
},
{
"epoch": 0.08370205588285623,
"grad_norm": 0.5101105570793152,
"learning_rate": 8.84164560866564e-06,
"loss": 322.9848,
"step": 214
},
{
"epoch": 0.08409318698511258,
"grad_norm": 1.2417665719985962,
"learning_rate": 8.83022221559489e-06,
"loss": 265.6229,
"step": 215
},
{
"epoch": 0.0844843180873689,
"grad_norm": 0.5343866944313049,
"learning_rate": 8.818750228139513e-06,
"loss": 146.003,
"step": 216
},
{
"epoch": 0.08487544918962525,
"grad_norm": 0.30504119396209717,
"learning_rate": 8.807229791845673e-06,
"loss": 37.3566,
"step": 217
},
{
"epoch": 0.08526658029188158,
"grad_norm": 0.8993078470230103,
"learning_rate": 8.795661052874217e-06,
"loss": 83.1912,
"step": 218
},
{
"epoch": 0.08565771139413793,
"grad_norm": 520.2239379882812,
"learning_rate": 8.78404415799881e-06,
"loss": 6.4627,
"step": 219
},
{
"epoch": 0.08604884249639426,
"grad_norm": 1.5063972473144531,
"learning_rate": 8.772379254604074e-06,
"loss": 59.2478,
"step": 220
},
{
"epoch": 0.0864399735986506,
"grad_norm": 0.9105270504951477,
"learning_rate": 8.76066649068372e-06,
"loss": 30.8765,
"step": 221
},
{
"epoch": 0.08683110470090694,
"grad_norm": 1.8939876556396484,
"learning_rate": 8.748906014838672e-06,
"loss": 147.7755,
"step": 222
},
{
"epoch": 0.08722223580316327,
"grad_norm": 0.8894091844558716,
"learning_rate": 8.737097976275177e-06,
"loss": 229.9513,
"step": 223
},
{
"epoch": 0.08761336690541961,
"grad_norm": 0.28113725781440735,
"learning_rate": 8.725242524802919e-06,
"loss": 185.9432,
"step": 224
},
{
"epoch": 0.08800449800767594,
"grad_norm": 2170.448974609375,
"learning_rate": 8.713339810833105e-06,
"loss": 86.6734,
"step": 225
},
{
"epoch": 0.08839562910993229,
"grad_norm": 0.9475433826446533,
"learning_rate": 8.701389985376578e-06,
"loss": 117.112,
"step": 226
},
{
"epoch": 0.08878676021218862,
"grad_norm": 0.6161375641822815,
"learning_rate": 8.689393200041878e-06,
"loss": 75.888,
"step": 227
},
{
"epoch": 0.08917789131444497,
"grad_norm": 1.6829807758331299,
"learning_rate": 8.677349607033336e-06,
"loss": 23.0701,
"step": 228
},
{
"epoch": 0.0895690224167013,
"grad_norm": 0.21339260041713715,
"learning_rate": 8.665259359149132e-06,
"loss": 28.2332,
"step": 229
},
{
"epoch": 0.08996015351895764,
"grad_norm": 2635.137451171875,
"learning_rate": 8.653122609779365e-06,
"loss": 124.7725,
"step": 230
},
{
"epoch": 0.09035128462121397,
"grad_norm": 0.34213894605636597,
"learning_rate": 8.640939512904097e-06,
"loss": 96.0887,
"step": 231
},
{
"epoch": 0.0907424157234703,
"grad_norm": 0.23164384067058563,
"learning_rate": 8.62871022309141e-06,
"loss": 69.6245,
"step": 232
},
{
"epoch": 0.09113354682572665,
"grad_norm": 1.0450925827026367,
"learning_rate": 8.61643489549544e-06,
"loss": 60.8622,
"step": 233
},
{
"epoch": 0.09152467792798298,
"grad_norm": 928.1207275390625,
"learning_rate": 8.604113685854407e-06,
"loss": 200.9607,
"step": 234
},
{
"epoch": 0.09191580903023933,
"grad_norm": 0.23424312472343445,
"learning_rate": 8.591746750488639e-06,
"loss": 37.2375,
"step": 235
},
{
"epoch": 0.09230694013249566,
"grad_norm": 0.17486761510372162,
"learning_rate": 8.579334246298593e-06,
"loss": 92.1142,
"step": 236
},
{
"epoch": 0.092698071234752,
"grad_norm": 0.2754494845867157,
"learning_rate": 8.566876330762861e-06,
"loss": 146.6022,
"step": 237
},
{
"epoch": 0.09308920233700833,
"grad_norm": 0.6534192562103271,
"learning_rate": 8.554373161936176e-06,
"loss": 152.2259,
"step": 238
},
{
"epoch": 0.09348033343926468,
"grad_norm": 0.19244331121444702,
"learning_rate": 8.541824898447399e-06,
"loss": 285.4724,
"step": 239
},
{
"epoch": 0.09387146454152101,
"grad_norm": 0.6852802038192749,
"learning_rate": 8.529231699497512e-06,
"loss": 170.299,
"step": 240
},
{
"epoch": 0.09426259564377736,
"grad_norm": 3.1934654712677,
"learning_rate": 8.516593724857598e-06,
"loss": 167.2633,
"step": 241
},
{
"epoch": 0.09465372674603369,
"grad_norm": 0.19524431228637695,
"learning_rate": 8.503911134866819e-06,
"loss": 103.9347,
"step": 242
},
{
"epoch": 0.09504485784829002,
"grad_norm": 316.60107421875,
"learning_rate": 8.491184090430365e-06,
"loss": 82.532,
"step": 243
},
{
"epoch": 0.09543598895054636,
"grad_norm": 1987.078125,
"learning_rate": 8.478412753017433e-06,
"loss": 205.8121,
"step": 244
},
{
"epoch": 0.0958271200528027,
"grad_norm": 4.607306957244873,
"learning_rate": 8.465597284659163e-06,
"loss": 28.8357,
"step": 245
},
{
"epoch": 0.09621825115505904,
"grad_norm": 317.93145751953125,
"learning_rate": 8.452737847946597e-06,
"loss": 96.3762,
"step": 246
},
{
"epoch": 0.09660938225731537,
"grad_norm": 1839.33740234375,
"learning_rate": 8.439834606028594e-06,
"loss": 81.6538,
"step": 247
},
{
"epoch": 0.09700051335957172,
"grad_norm": 1.588068962097168,
"learning_rate": 8.426887722609787e-06,
"loss": 86.3677,
"step": 248
},
{
"epoch": 0.09739164446182805,
"grad_norm": 0.27696385979652405,
"learning_rate": 8.413897361948484e-06,
"loss": 37.9441,
"step": 249
},
{
"epoch": 0.09778277556408439,
"grad_norm": 0.5097156167030334,
"learning_rate": 8.400863688854598e-06,
"loss": 46.0221,
"step": 250
},
{
"epoch": 0.09817390666634072,
"grad_norm": 0.16560612618923187,
"learning_rate": 8.387786868687549e-06,
"loss": 63.0729,
"step": 251
},
{
"epoch": 0.09856503776859706,
"grad_norm": 0.4086504280567169,
"learning_rate": 8.374667067354164e-06,
"loss": 123.6349,
"step": 252
},
{
"epoch": 0.0989561688708534,
"grad_norm": 2121.2421875,
"learning_rate": 8.361504451306585e-06,
"loss": 79.6353,
"step": 253
},
{
"epoch": 0.09934729997310973,
"grad_norm": 0.5328729152679443,
"learning_rate": 8.34829918754014e-06,
"loss": 57.2165,
"step": 254
},
{
"epoch": 0.09973843107536608,
"grad_norm": 0.5634379386901855,
"learning_rate": 8.335051443591236e-06,
"loss": 80.4085,
"step": 255
},
{
"epoch": 0.10012956217762241,
"grad_norm": 1125.6824951171875,
"learning_rate": 8.321761387535231e-06,
"loss": 95.3526,
"step": 256
},
{
"epoch": 0.10052069327987875,
"grad_norm": 3538.38525390625,
"learning_rate": 8.308429187984298e-06,
"loss": 142.8311,
"step": 257
},
{
"epoch": 0.10091182438213508,
"grad_norm": 0.32026898860931396,
"learning_rate": 8.295055014085289e-06,
"loss": 33.7843,
"step": 258
},
{
"epoch": 0.10130295548439143,
"grad_norm": 3.7499899864196777,
"learning_rate": 8.281639035517591e-06,
"loss": 64.4205,
"step": 259
},
{
"epoch": 0.10169408658664776,
"grad_norm": 0.6029073596000671,
"learning_rate": 8.268181422490969e-06,
"loss": 91.6323,
"step": 260
},
{
"epoch": 0.1020852176889041,
"grad_norm": 481.76361083984375,
"learning_rate": 8.254682345743406e-06,
"loss": 92.7615,
"step": 261
},
{
"epoch": 0.10247634879116044,
"grad_norm": 0.34256768226623535,
"learning_rate": 8.241141976538944e-06,
"loss": 104.3669,
"step": 262
},
{
"epoch": 0.10286747989341677,
"grad_norm": 1.014054775238037,
"learning_rate": 8.227560486665498e-06,
"loss": 30.8866,
"step": 263
},
{
"epoch": 0.10325861099567311,
"grad_norm": 0.3670375943183899,
"learning_rate": 8.213938048432697e-06,
"loss": 89.6379,
"step": 264
},
{
"epoch": 0.10364974209792945,
"grad_norm": 0.2565741539001465,
"learning_rate": 8.200274834669675e-06,
"loss": 81.031,
"step": 265
},
{
"epoch": 0.10404087320018579,
"grad_norm": 0.15868939459323883,
"learning_rate": 8.186571018722894e-06,
"loss": 184.6763,
"step": 266
},
{
"epoch": 0.10443200430244212,
"grad_norm": 1.7641242742538452,
"learning_rate": 8.172826774453937e-06,
"loss": 156.6617,
"step": 267
},
{
"epoch": 0.10482313540469847,
"grad_norm": 2.6388673782348633,
"learning_rate": 8.159042276237308e-06,
"loss": 36.3769,
"step": 268
},
{
"epoch": 0.1052142665069548,
"grad_norm": 0.3692081868648529,
"learning_rate": 8.145217698958213e-06,
"loss": 94.0488,
"step": 269
},
{
"epoch": 0.10560539760921114,
"grad_norm": 1.1230888366699219,
"learning_rate": 8.131353218010347e-06,
"loss": 182.0155,
"step": 270
},
{
"epoch": 0.10599652871146747,
"grad_norm": 2172.541259765625,
"learning_rate": 8.117449009293668e-06,
"loss": 144.515,
"step": 271
},
{
"epoch": 0.1063876598137238,
"grad_norm": 1854.9971923828125,
"learning_rate": 8.10350524921216e-06,
"loss": 135.7373,
"step": 272
},
{
"epoch": 0.10677879091598015,
"grad_norm": 0.37005868554115295,
"learning_rate": 8.089522114671603e-06,
"loss": 71.444,
"step": 273
},
{
"epoch": 0.10716992201823648,
"grad_norm": 1958.9207763671875,
"learning_rate": 8.075499783077321e-06,
"loss": 129.372,
"step": 274
},
{
"epoch": 0.10756105312049283,
"grad_norm": 0.5390088558197021,
"learning_rate": 8.061438432331935e-06,
"loss": 205.9629,
"step": 275
},
{
"epoch": 0.10795218422274916,
"grad_norm": 1.4290964603424072,
"learning_rate": 8.047338240833108e-06,
"loss": 107.3386,
"step": 276
},
{
"epoch": 0.1083433153250055,
"grad_norm": 0.30491700768470764,
"learning_rate": 8.033199387471278e-06,
"loss": 126.0688,
"step": 277
},
{
"epoch": 0.10873444642726184,
"grad_norm": 0.6881037950515747,
"learning_rate": 8.019022051627387e-06,
"loss": 137.38,
"step": 278
},
{
"epoch": 0.10912557752951818,
"grad_norm": 0.5975064039230347,
"learning_rate": 8.004806413170613e-06,
"loss": 49.5408,
"step": 279
},
{
"epoch": 0.10951670863177451,
"grad_norm": 2138.67041015625,
"learning_rate": 7.99055265245608e-06,
"loss": 87.7601,
"step": 280
},
{
"epoch": 0.10990783973403086,
"grad_norm": 0.19253171980381012,
"learning_rate": 7.976260950322572e-06,
"loss": 76.5956,
"step": 281
},
{
"epoch": 0.11029897083628719,
"grad_norm": 1.348046898841858,
"learning_rate": 7.96193148809024e-06,
"loss": 47.8194,
"step": 282
},
{
"epoch": 0.11069010193854352,
"grad_norm": 0.4207615554332733,
"learning_rate": 7.9475644475583e-06,
"loss": 59.7233,
"step": 283
},
{
"epoch": 0.11108123304079986,
"grad_norm": 1892.4029541015625,
"learning_rate": 7.933160011002729e-06,
"loss": 147.8456,
"step": 284
},
{
"epoch": 0.1114723641430562,
"grad_norm": 1.315190076828003,
"learning_rate": 7.918718361173951e-06,
"loss": 93.2988,
"step": 285
},
{
"epoch": 0.11186349524531254,
"grad_norm": 1.3171827793121338,
"learning_rate": 7.904239681294515e-06,
"loss": 44.8592,
"step": 286
},
{
"epoch": 0.11225462634756887,
"grad_norm": 314.198486328125,
"learning_rate": 7.889724155056776e-06,
"loss": 87.7935,
"step": 287
},
{
"epoch": 0.11264575744982522,
"grad_norm": 0.28883472084999084,
"learning_rate": 7.875171966620567e-06,
"loss": 101.7658,
"step": 288
},
{
"epoch": 0.11303688855208155,
"grad_norm": 0.49919599294662476,
"learning_rate": 7.860583300610849e-06,
"loss": 132.276,
"step": 289
},
{
"epoch": 0.1134280196543379,
"grad_norm": 0.644845187664032,
"learning_rate": 7.84595834211538e-06,
"loss": 88.4937,
"step": 290
},
{
"epoch": 0.11381915075659423,
"grad_norm": 0.21764078736305237,
"learning_rate": 7.83129727668237e-06,
"loss": 106.7949,
"step": 291
},
{
"epoch": 0.11421028185885056,
"grad_norm": 0.6382555961608887,
"learning_rate": 7.81660029031811e-06,
"loss": 86.3284,
"step": 292
},
{
"epoch": 0.1146014129611069,
"grad_norm": 3383.526123046875,
"learning_rate": 7.801867569484635e-06,
"loss": 163.3528,
"step": 293
},
{
"epoch": 0.11499254406336323,
"grad_norm": 1.0071310997009277,
"learning_rate": 7.78709930109734e-06,
"loss": 55.8198,
"step": 294
},
{
"epoch": 0.11538367516561958,
"grad_norm": 472.02276611328125,
"learning_rate": 7.772295672522615e-06,
"loss": 68.5582,
"step": 295
},
{
"epoch": 0.11577480626787591,
"grad_norm": 3507.451904296875,
"learning_rate": 7.75745687157547e-06,
"loss": 200.0802,
"step": 296
},
{
"epoch": 0.11616593737013226,
"grad_norm": 2918.291748046875,
"learning_rate": 7.742583086517151e-06,
"loss": 235.3087,
"step": 297
},
{
"epoch": 0.11655706847238859,
"grad_norm": 264.2690124511719,
"learning_rate": 7.727674506052744e-06,
"loss": 46.5808,
"step": 298
},
{
"epoch": 0.11694819957464493,
"grad_norm": 0.6071652173995972,
"learning_rate": 7.712731319328798e-06,
"loss": 66.332,
"step": 299
},
{
"epoch": 0.11733933067690126,
"grad_norm": 415.38214111328125,
"learning_rate": 7.697753715930906e-06,
"loss": 55.6848,
"step": 300
},
{
"epoch": 0.11773046177915761,
"grad_norm": 0.16611334681510925,
"learning_rate": 7.682741885881314e-06,
"loss": 176.503,
"step": 301
},
{
"epoch": 0.11812159288141394,
"grad_norm": 0.9599153995513916,
"learning_rate": 7.667696019636504e-06,
"loss": 12.792,
"step": 302
},
{
"epoch": 0.11851272398367027,
"grad_norm": 1961.807373046875,
"learning_rate": 7.652616308084774e-06,
"loss": 144.7594,
"step": 303
},
{
"epoch": 0.11890385508592662,
"grad_norm": 1706.922119140625,
"learning_rate": 7.637502942543825e-06,
"loss": 77.7838,
"step": 304
},
{
"epoch": 0.11929498618818295,
"grad_norm": 3011.58349609375,
"learning_rate": 7.622356114758328e-06,
"loss": 66.3472,
"step": 305
},
{
"epoch": 0.11968611729043929,
"grad_norm": 0.18727894127368927,
"learning_rate": 7.607176016897491e-06,
"loss": 20.0101,
"step": 306
},
{
"epoch": 0.12007724839269562,
"grad_norm": 747.8145141601562,
"learning_rate": 7.591962841552627e-06,
"loss": 124.6628,
"step": 307
},
{
"epoch": 0.12046837949495197,
"grad_norm": 2174.573486328125,
"learning_rate": 7.576716781734699e-06,
"loss": 122.6966,
"step": 308
},
{
"epoch": 0.1208595105972083,
"grad_norm": 1657.6822509765625,
"learning_rate": 7.561438030871886e-06,
"loss": 90.9553,
"step": 309
},
{
"epoch": 0.12125064169946465,
"grad_norm": 409.1562194824219,
"learning_rate": 7.546126782807117e-06,
"loss": 39.3561,
"step": 310
},
{
"epoch": 0.12164177280172098,
"grad_norm": 0.1211930438876152,
"learning_rate": 7.530783231795615e-06,
"loss": 1.299,
"step": 311
},
{
"epoch": 0.12203290390397731,
"grad_norm": 2307.365234375,
"learning_rate": 7.515407572502438e-06,
"loss": 200.4622,
"step": 312
},
{
"epoch": 0.12242403500623365,
"grad_norm": 0.3849581778049469,
"learning_rate": 7.500000000000001e-06,
"loss": 126.054,
"step": 313
},
{
"epoch": 0.12281516610848998,
"grad_norm": 2264.787109375,
"learning_rate": 7.484560709765605e-06,
"loss": 172.344,
"step": 314
},
{
"epoch": 0.12320629721074633,
"grad_norm": 1.2760062217712402,
"learning_rate": 7.469089897678958e-06,
"loss": 107.9826,
"step": 315
},
{
"epoch": 0.12359742831300266,
"grad_norm": 347.2667236328125,
"learning_rate": 7.453587760019691e-06,
"loss": 123.0772,
"step": 316
},
{
"epoch": 0.123988559415259,
"grad_norm": 0.7116885781288147,
"learning_rate": 7.438054493464859e-06,
"loss": 53.4364,
"step": 317
},
{
"epoch": 0.12437969051751534,
"grad_norm": 5167.86669921875,
"learning_rate": 7.422490295086457e-06,
"loss": 314.3017,
"step": 318
},
{
"epoch": 0.12477082161977168,
"grad_norm": 405.8331604003906,
"learning_rate": 7.406895362348916e-06,
"loss": 157.4734,
"step": 319
},
{
"epoch": 0.125161952722028,
"grad_norm": 289.1830749511719,
"learning_rate": 7.391269893106592e-06,
"loss": 86.5292,
"step": 320
},
{
"epoch": 0.12555308382428434,
"grad_norm": 1.3253761529922485,
"learning_rate": 7.375614085601265e-06,
"loss": 62.8788,
"step": 321
},
{
"epoch": 0.12594421492654068,
"grad_norm": 3807.062744140625,
"learning_rate": 7.359928138459615e-06,
"loss": 193.2301,
"step": 322
},
{
"epoch": 0.12633534602879704,
"grad_norm": 0.38906916975975037,
"learning_rate": 7.344212250690712e-06,
"loss": 87.5832,
"step": 323
},
{
"epoch": 0.12672647713105337,
"grad_norm": 1681.7589111328125,
"learning_rate": 7.328466621683481e-06,
"loss": 167.7496,
"step": 324
},
{
"epoch": 0.1271176082333097,
"grad_norm": 0.22865992784500122,
"learning_rate": 7.312691451204178e-06,
"loss": 42.7028,
"step": 325
},
{
"epoch": 0.12750873933556603,
"grad_norm": 0.14315825700759888,
"learning_rate": 7.296886939393852e-06,
"loss": 41.7926,
"step": 326
},
{
"epoch": 0.1278998704378224,
"grad_norm": 0.7125481963157654,
"learning_rate": 7.281053286765816e-06,
"loss": 150.8858,
"step": 327
},
{
"epoch": 0.12829100154007872,
"grad_norm": 1.3114720582962036,
"learning_rate": 7.265190694203086e-06,
"loss": 200.4679,
"step": 328
},
{
"epoch": 0.12868213264233505,
"grad_norm": 308.97198486328125,
"learning_rate": 7.249299362955846e-06,
"loss": 56.5048,
"step": 329
},
{
"epoch": 0.12907326374459138,
"grad_norm": 337.1923522949219,
"learning_rate": 7.233379494638891e-06,
"loss": 43.4137,
"step": 330
},
{
"epoch": 0.12946439484684774,
"grad_norm": 0.3174319565296173,
"learning_rate": 7.217431291229068e-06,
"loss": 57.2986,
"step": 331
},
{
"epoch": 0.12985552594910407,
"grad_norm": 4877.64306640625,
"learning_rate": 7.201454955062712e-06,
"loss": 295.9178,
"step": 332
},
{
"epoch": 0.1302466570513604,
"grad_norm": 2004.761474609375,
"learning_rate": 7.185450688833083e-06,
"loss": 175.5556,
"step": 333
},
{
"epoch": 0.13063778815361674,
"grad_norm": 0.5494270920753479,
"learning_rate": 7.169418695587791e-06,
"loss": 95.1294,
"step": 334
},
{
"epoch": 0.13102891925587307,
"grad_norm": 0.4806520938873291,
"learning_rate": 7.153359178726222e-06,
"loss": 40.1013,
"step": 335
},
{
"epoch": 0.13142005035812943,
"grad_norm": 0.6679964065551758,
"learning_rate": 7.137272341996958e-06,
"loss": 73.998,
"step": 336
},
{
"epoch": 0.13181118146038576,
"grad_norm": 0.3463609516620636,
"learning_rate": 7.121158389495187e-06,
"loss": 55.503,
"step": 337
},
{
"epoch": 0.1322023125626421,
"grad_norm": 0.4972734749317169,
"learning_rate": 7.10501752566012e-06,
"loss": 93.7803,
"step": 338
},
{
"epoch": 0.13259344366489842,
"grad_norm": 0.16640667617321014,
"learning_rate": 7.088849955272396e-06,
"loss": 118.0719,
"step": 339
},
{
"epoch": 0.13298457476715478,
"grad_norm": 0.20604762434959412,
"learning_rate": 7.072655883451478e-06,
"loss": 135.9177,
"step": 340
},
{
"epoch": 0.1333757058694111,
"grad_norm": 0.49932876229286194,
"learning_rate": 7.056435515653059e-06,
"loss": 161.2835,
"step": 341
},
{
"epoch": 0.13376683697166744,
"grad_norm": 0.6121784448623657,
"learning_rate": 7.040189057666449e-06,
"loss": 12.1418,
"step": 342
},
{
"epoch": 0.13415796807392377,
"grad_norm": 1.3118720054626465,
"learning_rate": 7.023916715611969e-06,
"loss": 122.6702,
"step": 343
},
{
"epoch": 0.1345490991761801,
"grad_norm": 0.12975674867630005,
"learning_rate": 7.007618695938334e-06,
"loss": 165.243,
"step": 344
},
{
"epoch": 0.13494023027843646,
"grad_norm": 1.6560322046279907,
"learning_rate": 6.991295205420028e-06,
"loss": 38.0507,
"step": 345
},
{
"epoch": 0.1353313613806928,
"grad_norm": 0.20794105529785156,
"learning_rate": 6.974946451154694e-06,
"loss": 22.9494,
"step": 346
},
{
"epoch": 0.13572249248294913,
"grad_norm": 0.2563984990119934,
"learning_rate": 6.9585726405604915e-06,
"loss": 109.0126,
"step": 347
},
{
"epoch": 0.13611362358520546,
"grad_norm": 2.3350882530212402,
"learning_rate": 6.942173981373474e-06,
"loss": 25.5488,
"step": 348
},
{
"epoch": 0.13650475468746182,
"grad_norm": 0.6753470301628113,
"learning_rate": 6.925750681644954e-06,
"loss": 81.2919,
"step": 349
},
{
"epoch": 0.13689588578971815,
"grad_norm": 1.4040716886520386,
"learning_rate": 6.90930294973886e-06,
"loss": 140.9022,
"step": 350
},
{
"epoch": 0.13728701689197448,
"grad_norm": 0.4277321696281433,
"learning_rate": 6.892830994329089e-06,
"loss": 86.2413,
"step": 351
},
{
"epoch": 0.1376781479942308,
"grad_norm": 1191.952880859375,
"learning_rate": 6.876335024396872e-06,
"loss": 152.5764,
"step": 352
},
{
"epoch": 0.13806927909648714,
"grad_norm": 0.21168895065784454,
"learning_rate": 6.859815249228106e-06,
"loss": 32.4788,
"step": 353
},
{
"epoch": 0.1384604101987435,
"grad_norm": 352.44512939453125,
"learning_rate": 6.8432718784107145e-06,
"loss": 42.3575,
"step": 354
},
{
"epoch": 0.13885154130099983,
"grad_norm": 0.7770540118217468,
"learning_rate": 6.8267051218319766e-06,
"loss": 177.1307,
"step": 355
},
{
"epoch": 0.13924267240325616,
"grad_norm": 1410.75146484375,
"learning_rate": 6.81011518967587e-06,
"loss": 55.0934,
"step": 356
},
{
"epoch": 0.1396338035055125,
"grad_norm": 0.3775579631328583,
"learning_rate": 6.793502292420402e-06,
"loss": 34.0766,
"step": 357
},
{
"epoch": 0.14002493460776885,
"grad_norm": 3233.5927734375,
"learning_rate": 6.7768666408349445e-06,
"loss": 149.3327,
"step": 358
},
{
"epoch": 0.14041606571002518,
"grad_norm": 1.1655175685882568,
"learning_rate": 6.760208445977551e-06,
"loss": 78.598,
"step": 359
},
{
"epoch": 0.14080719681228152,
"grad_norm": 3.140843629837036,
"learning_rate": 6.743527919192285e-06,
"loss": 51.5391,
"step": 360
},
{
"epoch": 0.14119832791453785,
"grad_norm": 0.5811576843261719,
"learning_rate": 6.726825272106539e-06,
"loss": 56.4923,
"step": 361
},
{
"epoch": 0.14158945901679418,
"grad_norm": 0.12908467650413513,
"learning_rate": 6.710100716628345e-06,
"loss": 57.4381,
"step": 362
},
{
"epoch": 0.14198059011905054,
"grad_norm": 573.8088989257812,
"learning_rate": 6.693354464943689e-06,
"loss": 98.4893,
"step": 363
},
{
"epoch": 0.14237172122130687,
"grad_norm": 117.73070526123047,
"learning_rate": 6.676586729513823e-06,
"loss": 96.0484,
"step": 364
},
{
"epoch": 0.1427628523235632,
"grad_norm": 1419.90625,
"learning_rate": 6.659797723072558e-06,
"loss": 95.183,
"step": 365
},
{
"epoch": 0.14315398342581953,
"grad_norm": 542.6209106445312,
"learning_rate": 6.642987658623581e-06,
"loss": 65.6222,
"step": 366
},
{
"epoch": 0.1435451145280759,
"grad_norm": 1.1538621187210083,
"learning_rate": 6.626156749437736e-06,
"loss": 120.1217,
"step": 367
},
{
"epoch": 0.14393624563033222,
"grad_norm": 398.84796142578125,
"learning_rate": 6.609305209050332e-06,
"loss": 74.3819,
"step": 368
},
{
"epoch": 0.14432737673258855,
"grad_norm": 268.26251220703125,
"learning_rate": 6.592433251258423e-06,
"loss": 76.2617,
"step": 369
},
{
"epoch": 0.14471850783484488,
"grad_norm": 321.3377990722656,
"learning_rate": 6.575541090118105e-06,
"loss": 40.9459,
"step": 370
},
{
"epoch": 0.14510963893710124,
"grad_norm": 0.22645658254623413,
"learning_rate": 6.558628939941792e-06,
"loss": 40.5776,
"step": 371
},
{
"epoch": 0.14550077003935757,
"grad_norm": 0.5002371072769165,
"learning_rate": 6.541697015295503e-06,
"loss": 75.5995,
"step": 372
},
{
"epoch": 0.1458919011416139,
"grad_norm": 0.16194933652877808,
"learning_rate": 6.524745530996137e-06,
"loss": 84.202,
"step": 373
},
{
"epoch": 0.14628303224387024,
"grad_norm": 0.37668415904045105,
"learning_rate": 6.507774702108748e-06,
"loss": 83.8723,
"step": 374
},
{
"epoch": 0.14667416334612657,
"grad_norm": 288.6870422363281,
"learning_rate": 6.490784743943819e-06,
"loss": 128.4052,
"step": 375
},
{
"epoch": 0.14706529444838293,
"grad_norm": 0.13677971065044403,
"learning_rate": 6.473775872054522e-06,
"loss": 57.1975,
"step": 376
},
{
"epoch": 0.14745642555063926,
"grad_norm": 0.3166632652282715,
"learning_rate": 6.456748302233995e-06,
"loss": 48.1106,
"step": 377
},
{
"epoch": 0.1478475566528956,
"grad_norm": 1.3575718402862549,
"learning_rate": 6.439702250512596e-06,
"loss": 129.3716,
"step": 378
},
{
"epoch": 0.14823868775515192,
"grad_norm": 0.21969862282276154,
"learning_rate": 6.4226379331551625e-06,
"loss": 113.5939,
"step": 379
},
{
"epoch": 0.14862981885740828,
"grad_norm": 0.13174912333488464,
"learning_rate": 6.405555566658276e-06,
"loss": 32.1963,
"step": 380
},
{
"epoch": 0.1490209499596646,
"grad_norm": 0.863881528377533,
"learning_rate": 6.388455367747503e-06,
"loss": 29.8159,
"step": 381
},
{
"epoch": 0.14941208106192094,
"grad_norm": 0.6778990030288696,
"learning_rate": 6.3713375533746525e-06,
"loss": 128.0998,
"step": 382
},
{
"epoch": 0.14980321216417727,
"grad_norm": 1.4469739198684692,
"learning_rate": 6.354202340715027e-06,
"loss": 52.4234,
"step": 383
},
{
"epoch": 0.1501943432664336,
"grad_norm": 0.18584023416042328,
"learning_rate": 6.337049947164656e-06,
"loss": 70.7733,
"step": 384
},
{
"epoch": 0.15058547436868996,
"grad_norm": 2002.4371337890625,
"learning_rate": 6.319880590337549e-06,
"loss": 84.5797,
"step": 385
},
{
"epoch": 0.1509766054709463,
"grad_norm": 0.19816723465919495,
"learning_rate": 6.302694488062931e-06,
"loss": 76.0305,
"step": 386
},
{
"epoch": 0.15136773657320263,
"grad_norm": 0.5410847067832947,
"learning_rate": 6.2854918583824745e-06,
"loss": 74.2372,
"step": 387
},
{
"epoch": 0.15175886767545896,
"grad_norm": 1.376298427581787,
"learning_rate": 6.268272919547537e-06,
"loss": 24.7493,
"step": 388
},
{
"epoch": 0.15214999877771532,
"grad_norm": 142.1651611328125,
"learning_rate": 6.251037890016396e-06,
"loss": 34.4196,
"step": 389
},
{
"epoch": 0.15254112987997165,
"grad_norm": 2054.256103515625,
"learning_rate": 6.233786988451468e-06,
"loss": 86.9907,
"step": 390
},
{
"epoch": 0.15293226098222798,
"grad_norm": 3574.6181640625,
"learning_rate": 6.216520433716544e-06,
"loss": 162.4229,
"step": 391
},
{
"epoch": 0.1533233920844843,
"grad_norm": 1714.3194580078125,
"learning_rate": 6.199238444874005e-06,
"loss": 124.9201,
"step": 392
},
{
"epoch": 0.15371452318674064,
"grad_norm": 1639.366455078125,
"learning_rate": 6.181941241182044e-06,
"loss": 76.7506,
"step": 393
},
{
"epoch": 0.154105654288997,
"grad_norm": 1059.76123046875,
"learning_rate": 6.164629042091894e-06,
"loss": 34.25,
"step": 394
},
{
"epoch": 0.15449678539125333,
"grad_norm": 1.3340238332748413,
"learning_rate": 6.1473020672450275e-06,
"loss": 85.7469,
"step": 395
},
{
"epoch": 0.15488791649350966,
"grad_norm": 1639.532958984375,
"learning_rate": 6.1299605364703826e-06,
"loss": 93.6497,
"step": 396
},
{
"epoch": 0.155279047595766,
"grad_norm": 0.20548588037490845,
"learning_rate": 6.112604669781572e-06,
"loss": 83.9102,
"step": 397
},
{
"epoch": 0.15567017869802235,
"grad_norm": 1.0026606321334839,
"learning_rate": 6.095234687374085e-06,
"loss": 74.3395,
"step": 398
},
{
"epoch": 0.15606130980027869,
"grad_norm": 453.576171875,
"learning_rate": 6.0778508096224985e-06,
"loss": 57.6422,
"step": 399
},
{
"epoch": 0.15645244090253502,
"grad_norm": 0.14913234114646912,
"learning_rate": 6.060453257077686e-06,
"loss": 26.2558,
"step": 400
},
{
"epoch": 0.15684357200479135,
"grad_norm": 0.1819409877061844,
"learning_rate": 6.043042250464005e-06,
"loss": 57.1572,
"step": 401
},
{
"epoch": 0.15723470310704768,
"grad_norm": 0.3527912497520447,
"learning_rate": 6.025618010676516e-06,
"loss": 78.8062,
"step": 402
},
{
"epoch": 0.15762583420930404,
"grad_norm": 1.7945302724838257,
"learning_rate": 6.008180758778167e-06,
"loss": 34.1183,
"step": 403
},
{
"epoch": 0.15801696531156037,
"grad_norm": 0.3905615508556366,
"learning_rate": 5.990730715996989e-06,
"loss": 38.1853,
"step": 404
},
{
"epoch": 0.1584080964138167,
"grad_norm": 1662.2044677734375,
"learning_rate": 5.973268103723293e-06,
"loss": 90.833,
"step": 405
},
{
"epoch": 0.15879922751607303,
"grad_norm": 894.9415283203125,
"learning_rate": 5.955793143506863e-06,
"loss": 54.7208,
"step": 406
},
{
"epoch": 0.1591903586183294,
"grad_norm": 0.2516638934612274,
"learning_rate": 5.938306057054139e-06,
"loss": 49.8002,
"step": 407
},
{
"epoch": 0.15958148972058572,
"grad_norm": 0.16948434710502625,
"learning_rate": 5.920807066225409e-06,
"loss": 119.8379,
"step": 408
},
{
"epoch": 0.15997262082284205,
"grad_norm": 0.480121374130249,
"learning_rate": 5.903296393031996e-06,
"loss": 57.558,
"step": 409
},
{
"epoch": 0.16036375192509839,
"grad_norm": 467.2088623046875,
"learning_rate": 5.885774259633432e-06,
"loss": 108.7793,
"step": 410
},
{
"epoch": 0.16075488302735474,
"grad_norm": 0.16975107789039612,
"learning_rate": 5.8682408883346535e-06,
"loss": 23.7654,
"step": 411
},
{
"epoch": 0.16114601412961108,
"grad_norm": 1.1074670553207397,
"learning_rate": 5.850696501583164e-06,
"loss": 35.2543,
"step": 412
},
{
"epoch": 0.1615371452318674,
"grad_norm": 2842.92529296875,
"learning_rate": 5.8331413219662295e-06,
"loss": 78.9185,
"step": 413
},
{
"epoch": 0.16192827633412374,
"grad_norm": 0.22508656978607178,
"learning_rate": 5.815575572208042e-06,
"loss": 25.2656,
"step": 414
},
{
"epoch": 0.16231940743638007,
"grad_norm": 0.14979542791843414,
"learning_rate": 5.797999475166897e-06,
"loss": 43.0969,
"step": 415
},
{
"epoch": 0.16271053853863643,
"grad_norm": 1.3583056926727295,
"learning_rate": 5.78041325383237e-06,
"loss": 111.1471,
"step": 416
},
{
"epoch": 0.16310166964089276,
"grad_norm": 0.3384321630001068,
"learning_rate": 5.762817131322482e-06,
"loss": 168.3297,
"step": 417
},
{
"epoch": 0.1634928007431491,
"grad_norm": 0.4203161895275116,
"learning_rate": 5.745211330880872e-06,
"loss": 227.7544,
"step": 418
},
{
"epoch": 0.16388393184540542,
"grad_norm": 0.2434893250465393,
"learning_rate": 5.7275960758739655e-06,
"loss": 188.506,
"step": 419
},
{
"epoch": 0.16427506294766178,
"grad_norm": 374.4583740234375,
"learning_rate": 5.709971589788136e-06,
"loss": 103.9844,
"step": 420
},
{
"epoch": 0.1646661940499181,
"grad_norm": 0.3567257523536682,
"learning_rate": 5.69233809622687e-06,
"loss": 45.0813,
"step": 421
},
{
"epoch": 0.16505732515217444,
"grad_norm": 0.3658342659473419,
"learning_rate": 5.674695818907943e-06,
"loss": 79.1331,
"step": 422
},
{
"epoch": 0.16544845625443078,
"grad_norm": 1.0847684144973755,
"learning_rate": 5.65704498166056e-06,
"loss": 89.7686,
"step": 423
},
{
"epoch": 0.1658395873566871,
"grad_norm": 1.6579034328460693,
"learning_rate": 5.6393858084225305e-06,
"loss": 36.3624,
"step": 424
},
{
"epoch": 0.16623071845894347,
"grad_norm": 136.94493103027344,
"learning_rate": 5.621718523237427e-06,
"loss": 86.6906,
"step": 425
},
{
"epoch": 0.1666218495611998,
"grad_norm": 1437.6072998046875,
"learning_rate": 5.604043350251733e-06,
"loss": 47.5283,
"step": 426
},
{
"epoch": 0.16701298066345613,
"grad_norm": 1.1629807949066162,
"learning_rate": 5.586360513712011e-06,
"loss": 69.8016,
"step": 427
},
{
"epoch": 0.16740411176571246,
"grad_norm": 0.08627960830926895,
"learning_rate": 5.568670237962045e-06,
"loss": 41.2346,
"step": 428
},
{
"epoch": 0.16779524286796882,
"grad_norm": 307.7284851074219,
"learning_rate": 5.550972747440007e-06,
"loss": 64.6287,
"step": 429
},
{
"epoch": 0.16818637397022515,
"grad_norm": 275.1657409667969,
"learning_rate": 5.533268266675601e-06,
"loss": 37.8621,
"step": 430
},
{
"epoch": 0.16857750507248148,
"grad_norm": 321.581298828125,
"learning_rate": 5.515557020287219e-06,
"loss": 21.3013,
"step": 431
},
{
"epoch": 0.1689686361747378,
"grad_norm": 107.34349822998047,
"learning_rate": 5.497839232979084e-06,
"loss": 22.1243,
"step": 432
},
{
"epoch": 0.16935976727699414,
"grad_norm": 1374.56396484375,
"learning_rate": 5.480115129538409e-06,
"loss": 36.583,
"step": 433
},
{
"epoch": 0.1697508983792505,
"grad_norm": 0.42376813292503357,
"learning_rate": 5.4623849348325396e-06,
"loss": 44.8171,
"step": 434
},
{
"epoch": 0.17014202948150683,
"grad_norm": 1.2415663003921509,
"learning_rate": 5.444648873806101e-06,
"loss": 61.8365,
"step": 435
},
{
"epoch": 0.17053316058376317,
"grad_norm": 0.8514006733894348,
"learning_rate": 5.426907171478143e-06,
"loss": 46.8816,
"step": 436
},
{
"epoch": 0.1709242916860195,
"grad_norm": 0.2142760455608368,
"learning_rate": 5.409160052939292e-06,
"loss": 35.3242,
"step": 437
},
{
"epoch": 0.17131542278827586,
"grad_norm": 0.15164269506931305,
"learning_rate": 5.391407743348884e-06,
"loss": 11.6772,
"step": 438
},
{
"epoch": 0.1717065538905322,
"grad_norm": 1661.5631103515625,
"learning_rate": 5.373650467932122e-06,
"loss": 53.8261,
"step": 439
},
{
"epoch": 0.17209768499278852,
"grad_norm": 1743.482421875,
"learning_rate": 5.355888451977204e-06,
"loss": 35.92,
"step": 440
},
{
"epoch": 0.17248881609504485,
"grad_norm": 0.20529299974441528,
"learning_rate": 5.3381219208324755e-06,
"loss": 33.3962,
"step": 441
},
{
"epoch": 0.1728799471973012,
"grad_norm": 2.2511260509490967,
"learning_rate": 5.320351099903565e-06,
"loss": 55.9249,
"step": 442
},
{
"epoch": 0.17327107829955754,
"grad_norm": 0.11457404494285583,
"learning_rate": 5.302576214650527e-06,
"loss": 73.4981,
"step": 443
},
{
"epoch": 0.17366220940181387,
"grad_norm": 138.77809143066406,
"learning_rate": 5.284797490584979e-06,
"loss": 41.5857,
"step": 444
},
{
"epoch": 0.1740533405040702,
"grad_norm": 259.2716064453125,
"learning_rate": 5.267015153267246e-06,
"loss": 63.9765,
"step": 445
},
{
"epoch": 0.17444447160632653,
"grad_norm": 1.0777804851531982,
"learning_rate": 5.249229428303486e-06,
"loss": 11.9817,
"step": 446
},
{
"epoch": 0.1748356027085829,
"grad_norm": 2536.879150390625,
"learning_rate": 5.231440541342846e-06,
"loss": 57.9686,
"step": 447
},
{
"epoch": 0.17522673381083922,
"grad_norm": 0.2679949700832367,
"learning_rate": 5.213648718074584e-06,
"loss": 31.2022,
"step": 448
},
{
"epoch": 0.17561786491309556,
"grad_norm": 0.14789415895938873,
"learning_rate": 5.1958541842252145e-06,
"loss": 46.2886,
"step": 449
},
{
"epoch": 0.1760089960153519,
"grad_norm": 3.2639567852020264,
"learning_rate": 5.178057165555636e-06,
"loss": 84.1918,
"step": 450
},
{
"epoch": 0.17640012711760825,
"grad_norm": 372.91387939453125,
"learning_rate": 5.160257887858278e-06,
"loss": 20.9396,
"step": 451
},
{
"epoch": 0.17679125821986458,
"grad_norm": 0.30689093470573425,
"learning_rate": 5.142456576954225e-06,
"loss": 15.7738,
"step": 452
},
{
"epoch": 0.1771823893221209,
"grad_norm": 247.07366943359375,
"learning_rate": 5.1246534586903655e-06,
"loss": 19.9637,
"step": 453
},
{
"epoch": 0.17757352042437724,
"grad_norm": 0.7007215619087219,
"learning_rate": 5.106848758936508e-06,
"loss": 33.0597,
"step": 454
},
{
"epoch": 0.17796465152663357,
"grad_norm": 290.64202880859375,
"learning_rate": 5.089042703582533e-06,
"loss": 65.3849,
"step": 455
},
{
"epoch": 0.17835578262888993,
"grad_norm": 0.4069232940673828,
"learning_rate": 5.071235518535516e-06,
"loss": 25.4287,
"step": 456
},
{
"epoch": 0.17874691373114626,
"grad_norm": 0.6326934099197388,
"learning_rate": 5.053427429716867e-06,
"loss": 43.9984,
"step": 457
},
{
"epoch": 0.1791380448334026,
"grad_norm": 0.269971638917923,
"learning_rate": 5.0356186630594585e-06,
"loss": 46.612,
"step": 458
},
{
"epoch": 0.17952917593565892,
"grad_norm": 0.21015766263008118,
"learning_rate": 5.017809444504768e-06,
"loss": 48.1048,
"step": 459
},
{
"epoch": 0.17992030703791528,
"grad_norm": 357.07867431640625,
"learning_rate": 5e-06,
"loss": 25.4451,
"step": 460
},
{
"epoch": 0.18031143814017161,
"grad_norm": 476.7673645019531,
"learning_rate": 4.982190555495236e-06,
"loss": 35.3956,
"step": 461
},
{
"epoch": 0.18070256924242795,
"grad_norm": 0.19846689701080322,
"learning_rate": 4.964381336940542e-06,
"loss": 76.3696,
"step": 462
},
{
"epoch": 0.18109370034468428,
"grad_norm": 0.3459748923778534,
"learning_rate": 4.946572570283135e-06,
"loss": 28.662,
"step": 463
},
{
"epoch": 0.1814848314469406,
"grad_norm": 2.347224712371826,
"learning_rate": 4.928764481464485e-06,
"loss": 43.5744,
"step": 464
},
{
"epoch": 0.18187596254919697,
"grad_norm": 1.0240310430526733,
"learning_rate": 4.910957296417467e-06,
"loss": 38.6815,
"step": 465
},
{
"epoch": 0.1822670936514533,
"grad_norm": 2363.06884765625,
"learning_rate": 4.893151241063493e-06,
"loss": 97.8371,
"step": 466
},
{
"epoch": 0.18265822475370963,
"grad_norm": 0.7521228790283203,
"learning_rate": 4.875346541309637e-06,
"loss": 87.3877,
"step": 467
},
{
"epoch": 0.18304935585596596,
"grad_norm": 186.02557373046875,
"learning_rate": 4.857543423045775e-06,
"loss": 61.2668,
"step": 468
},
{
"epoch": 0.18344048695822232,
"grad_norm": 0.3884137272834778,
"learning_rate": 4.839742112141725e-06,
"loss": 57.2652,
"step": 469
},
{
"epoch": 0.18383161806047865,
"grad_norm": 1904.343994140625,
"learning_rate": 4.821942834444367e-06,
"loss": 44.9092,
"step": 470
},
{
"epoch": 0.18422274916273498,
"grad_norm": 0.933085024356842,
"learning_rate": 4.804145815774787e-06,
"loss": 45.1724,
"step": 471
},
{
"epoch": 0.18461388026499131,
"grad_norm": 0.17478938400745392,
"learning_rate": 4.786351281925417e-06,
"loss": 63.2364,
"step": 472
},
{
"epoch": 0.18500501136724765,
"grad_norm": 0.3920159637928009,
"learning_rate": 4.768559458657156e-06,
"loss": 9.0603,
"step": 473
},
{
"epoch": 0.185396142469504,
"grad_norm": 0.20027987658977509,
"learning_rate": 4.750770571696514e-06,
"loss": 67.3941,
"step": 474
},
{
"epoch": 0.18578727357176034,
"grad_norm": 4896.18603515625,
"learning_rate": 4.732984846732755e-06,
"loss": 110.2459,
"step": 475
},
{
"epoch": 0.18617840467401667,
"grad_norm": 0.6982368230819702,
"learning_rate": 4.7152025094150214e-06,
"loss": 60.1346,
"step": 476
},
{
"epoch": 0.186569535776273,
"grad_norm": 0.21346724033355713,
"learning_rate": 4.697423785349475e-06,
"loss": 86.276,
"step": 477
},
{
"epoch": 0.18696066687852936,
"grad_norm": 0.5096214413642883,
"learning_rate": 4.679648900096436e-06,
"loss": 47.5857,
"step": 478
},
{
"epoch": 0.1873517979807857,
"grad_norm": 0.5563175678253174,
"learning_rate": 4.661878079167527e-06,
"loss": 49.3372,
"step": 479
},
{
"epoch": 0.18774292908304202,
"grad_norm": 1.7563621997833252,
"learning_rate": 4.644111548022798e-06,
"loss": 40.9886,
"step": 480
},
{
"epoch": 0.18813406018529835,
"grad_norm": 112.32270050048828,
"learning_rate": 4.626349532067879e-06,
"loss": 47.8111,
"step": 481
},
{
"epoch": 0.1885251912875547,
"grad_norm": 1.5125802755355835,
"learning_rate": 4.608592256651117e-06,
"loss": 32.0425,
"step": 482
},
{
"epoch": 0.18891632238981104,
"grad_norm": 0.3377935588359833,
"learning_rate": 4.5908399470607106e-06,
"loss": 41.984,
"step": 483
},
{
"epoch": 0.18930745349206737,
"grad_norm": 262.73553466796875,
"learning_rate": 4.573092828521857e-06,
"loss": 15.6826,
"step": 484
},
{
"epoch": 0.1896985845943237,
"grad_norm": 1222.323974609375,
"learning_rate": 4.555351126193901e-06,
"loss": 82.0725,
"step": 485
},
{
"epoch": 0.19008971569658004,
"grad_norm": 0.4867708384990692,
"learning_rate": 4.537615065167461e-06,
"loss": 68.1378,
"step": 486
},
{
"epoch": 0.1904808467988364,
"grad_norm": 0.24868200719356537,
"learning_rate": 4.5198848704615915e-06,
"loss": 34.2792,
"step": 487
},
{
"epoch": 0.19087197790109273,
"grad_norm": 1565.045654296875,
"learning_rate": 4.502160767020918e-06,
"loss": 61.3013,
"step": 488
},
{
"epoch": 0.19126310900334906,
"grad_norm": 0.12846969068050385,
"learning_rate": 4.484442979712783e-06,
"loss": 101.128,
"step": 489
},
{
"epoch": 0.1916542401056054,
"grad_norm": 0.2493712157011032,
"learning_rate": 4.466731733324399e-06,
"loss": 16.2675,
"step": 490
},
{
"epoch": 0.19204537120786175,
"grad_norm": 0.8139014840126038,
"learning_rate": 4.449027252559994e-06,
"loss": 56.7364,
"step": 491
},
{
"epoch": 0.19243650231011808,
"grad_norm": 0.44511955976486206,
"learning_rate": 4.431329762037958e-06,
"loss": 22.7369,
"step": 492
},
{
"epoch": 0.1928276334123744,
"grad_norm": 334.3827819824219,
"learning_rate": 4.413639486287992e-06,
"loss": 8.3813,
"step": 493
},
{
"epoch": 0.19321876451463074,
"grad_norm": 1712.8160400390625,
"learning_rate": 4.395956649748269e-06,
"loss": 57.2132,
"step": 494
},
{
"epoch": 0.19360989561688707,
"grad_norm": 1017.40771484375,
"learning_rate": 4.3782814767625755e-06,
"loss": 39.137,
"step": 495
},
{
"epoch": 0.19400102671914343,
"grad_norm": 2.061763286590576,
"learning_rate": 4.3606141915774695e-06,
"loss": 29.4071,
"step": 496
},
{
"epoch": 0.19439215782139976,
"grad_norm": 0.351924329996109,
"learning_rate": 4.342955018339442e-06,
"loss": 71.3075,
"step": 497
},
{
"epoch": 0.1947832889236561,
"grad_norm": 0.5345131754875183,
"learning_rate": 4.3253041810920595e-06,
"loss": 53.7062,
"step": 498
},
{
"epoch": 0.19517442002591243,
"grad_norm": 0.4997957944869995,
"learning_rate": 4.307661903773129e-06,
"loss": 52.0548,
"step": 499
},
{
"epoch": 0.19556555112816879,
"grad_norm": 0.646457850933075,
"learning_rate": 4.290028410211866e-06,
"loss": 92.2717,
"step": 500
},
{
"epoch": 0.19595668223042512,
"grad_norm": 0.3356407880783081,
"learning_rate": 4.272403924126035e-06,
"loss": 89.8288,
"step": 501
},
{
"epoch": 0.19634781333268145,
"grad_norm": 1.2155108451843262,
"learning_rate": 4.254788669119127e-06,
"loss": 42.697,
"step": 502
},
{
"epoch": 0.19673894443493778,
"grad_norm": 0.8059905171394348,
"learning_rate": 4.237182868677519e-06,
"loss": 45.4195,
"step": 503
},
{
"epoch": 0.1971300755371941,
"grad_norm": 0.19161798059940338,
"learning_rate": 4.219586746167632e-06,
"loss": 5.6987,
"step": 504
},
{
"epoch": 0.19752120663945047,
"grad_norm": 1.4473623037338257,
"learning_rate": 4.2020005248331056e-06,
"loss": 83.2729,
"step": 505
},
{
"epoch": 0.1979123377417068,
"grad_norm": 3267.076904296875,
"learning_rate": 4.18442442779196e-06,
"loss": 64.1532,
"step": 506
},
{
"epoch": 0.19830346884396313,
"grad_norm": 0.3686378002166748,
"learning_rate": 4.166858678033771e-06,
"loss": 5.8373,
"step": 507
},
{
"epoch": 0.19869459994621946,
"grad_norm": 0.7481865882873535,
"learning_rate": 4.149303498416838e-06,
"loss": 30.1228,
"step": 508
},
{
"epoch": 0.19908573104847582,
"grad_norm": 1.0720148086547852,
"learning_rate": 4.131759111665349e-06,
"loss": 16.1707,
"step": 509
},
{
"epoch": 0.19947686215073215,
"grad_norm": 253.39622497558594,
"learning_rate": 4.114225740366569e-06,
"loss": 11.6924,
"step": 510
},
{
"epoch": 0.19986799325298849,
"grad_norm": 1.3426392078399658,
"learning_rate": 4.096703606968007e-06,
"loss": 18.6361,
"step": 511
},
{
"epoch": 0.20025912435524482,
"grad_norm": 242.68768310546875,
"learning_rate": 4.079192933774592e-06,
"loss": 60.1703,
"step": 512
},
{
"epoch": 0.20065025545750115,
"grad_norm": 0.12895415723323822,
"learning_rate": 4.061693942945863e-06,
"loss": 39.873,
"step": 513
},
{
"epoch": 0.2010413865597575,
"grad_norm": 446.8625793457031,
"learning_rate": 4.04420685649314e-06,
"loss": 61.0251,
"step": 514
},
{
"epoch": 0.20143251766201384,
"grad_norm": 0.19806069135665894,
"learning_rate": 4.026731896276708e-06,
"loss": 45.4137,
"step": 515
},
{
"epoch": 0.20182364876427017,
"grad_norm": 0.357597678899765,
"learning_rate": 4.009269284003014e-06,
"loss": 110.9987,
"step": 516
},
{
"epoch": 0.2022147798665265,
"grad_norm": 0.15286563336849213,
"learning_rate": 3.991819241221836e-06,
"loss": 52.1221,
"step": 517
},
{
"epoch": 0.20260591096878286,
"grad_norm": 0.1778210997581482,
"learning_rate": 3.974381989323484e-06,
"loss": 57.3214,
"step": 518
},
{
"epoch": 0.2029970420710392,
"grad_norm": 1.1525286436080933,
"learning_rate": 3.956957749535997e-06,
"loss": 26.8482,
"step": 519
},
{
"epoch": 0.20338817317329552,
"grad_norm": 1.2070550918579102,
"learning_rate": 3.939546742922318e-06,
"loss": 68.4404,
"step": 520
},
{
"epoch": 0.20377930427555185,
"grad_norm": 1.1822954416275024,
"learning_rate": 3.9221491903775014e-06,
"loss": 20.8915,
"step": 521
},
{
"epoch": 0.2041704353778082,
"grad_norm": 0.27339041233062744,
"learning_rate": 3.904765312625916e-06,
"loss": 38.3124,
"step": 522
},
{
"epoch": 0.20456156648006454,
"grad_norm": 251.54371643066406,
"learning_rate": 3.887395330218429e-06,
"loss": 75.3845,
"step": 523
},
{
"epoch": 0.20495269758232088,
"grad_norm": 0.2657800316810608,
"learning_rate": 3.8700394635296166e-06,
"loss": 50.7794,
"step": 524
},
{
"epoch": 0.2053438286845772,
"grad_norm": 0.9252436757087708,
"learning_rate": 3.852697932754974e-06,
"loss": 80.2724,
"step": 525
},
{
"epoch": 0.20573495978683354,
"grad_norm": 0.18160250782966614,
"learning_rate": 3.835370957908108e-06,
"loss": 18.6159,
"step": 526
},
{
"epoch": 0.2061260908890899,
"grad_norm": 1.437393069267273,
"learning_rate": 3.818058758817956e-06,
"loss": 62.4316,
"step": 527
},
{
"epoch": 0.20651722199134623,
"grad_norm": 2809.828369140625,
"learning_rate": 3.800761555125997e-06,
"loss": 25.8665,
"step": 528
},
{
"epoch": 0.20690835309360256,
"grad_norm": 952.0728149414062,
"learning_rate": 3.783479566283457e-06,
"loss": 22.2021,
"step": 529
},
{
"epoch": 0.2072994841958589,
"grad_norm": 2.9290764331817627,
"learning_rate": 3.7662130115485317e-06,
"loss": 62.186,
"step": 530
},
{
"epoch": 0.20769061529811525,
"grad_norm": 0.4081960618495941,
"learning_rate": 3.748962109983605e-06,
"loss": 36.3092,
"step": 531
},
{
"epoch": 0.20808174640037158,
"grad_norm": 0.33659136295318604,
"learning_rate": 3.731727080452464e-06,
"loss": 19.2433,
"step": 532
},
{
"epoch": 0.2084728775026279,
"grad_norm": 1739.4482421875,
"learning_rate": 3.714508141617527e-06,
"loss": 76.853,
"step": 533
},
{
"epoch": 0.20886400860488424,
"grad_norm": 3326.531494140625,
"learning_rate": 3.69730551193707e-06,
"loss": 99.9301,
"step": 534
},
{
"epoch": 0.20925513970714057,
"grad_norm": 220.89852905273438,
"learning_rate": 3.6801194096624515e-06,
"loss": 41.4909,
"step": 535
},
{
"epoch": 0.20964627080939693,
"grad_norm": 106.39971160888672,
"learning_rate": 3.6629500528353464e-06,
"loss": 28.8744,
"step": 536
},
{
"epoch": 0.21003740191165327,
"grad_norm": 1.0691938400268555,
"learning_rate": 3.6457976592849753e-06,
"loss": 20.5078,
"step": 537
},
{
"epoch": 0.2104285330139096,
"grad_norm": 0.6214406490325928,
"learning_rate": 3.6286624466253496e-06,
"loss": 45.2672,
"step": 538
},
{
"epoch": 0.21081966411616593,
"grad_norm": 1.1435602903366089,
"learning_rate": 3.6115446322525007e-06,
"loss": 62.7016,
"step": 539
},
{
"epoch": 0.2112107952184223,
"grad_norm": 0.2811889052391052,
"learning_rate": 3.594444433341725e-06,
"loss": 73.1093,
"step": 540
},
{
"epoch": 0.21160192632067862,
"grad_norm": 1629.179443359375,
"learning_rate": 3.5773620668448384e-06,
"loss": 70.3456,
"step": 541
},
{
"epoch": 0.21199305742293495,
"grad_norm": 0.6402444839477539,
"learning_rate": 3.560297749487407e-06,
"loss": 100.5996,
"step": 542
},
{
"epoch": 0.21238418852519128,
"grad_norm": 0.20082825422286987,
"learning_rate": 3.543251697766006e-06,
"loss": 45.9456,
"step": 543
},
{
"epoch": 0.2127753196274476,
"grad_norm": 0.2259850949048996,
"learning_rate": 3.526224127945479e-06,
"loss": 81.2139,
"step": 544
},
{
"epoch": 0.21316645072970397,
"grad_norm": 0.5259697437286377,
"learning_rate": 3.5092152560561833e-06,
"loss": 29.0951,
"step": 545
},
{
"epoch": 0.2135575818319603,
"grad_norm": 0.2201029360294342,
"learning_rate": 3.4922252978912523e-06,
"loss": 39.3512,
"step": 546
},
{
"epoch": 0.21394871293421663,
"grad_norm": 0.6921319365501404,
"learning_rate": 3.475254469003865e-06,
"loss": 74.4222,
"step": 547
},
{
"epoch": 0.21433984403647297,
"grad_norm": 0.3210639953613281,
"learning_rate": 3.4583029847044996e-06,
"loss": 78.7619,
"step": 548
},
{
"epoch": 0.21473097513872932,
"grad_norm": 0.49791640043258667,
"learning_rate": 3.4413710600582096e-06,
"loss": 41.5078,
"step": 549
},
{
"epoch": 0.21512210624098566,
"grad_norm": 1.5678467750549316,
"learning_rate": 3.424458909881897e-06,
"loss": 25.0795,
"step": 550
},
{
"epoch": 0.215513237343242,
"grad_norm": 0.4146921634674072,
"learning_rate": 3.4075667487415785e-06,
"loss": 53.2567,
"step": 551
},
{
"epoch": 0.21590436844549832,
"grad_norm": 0.3883844017982483,
"learning_rate": 3.3906947909496696e-06,
"loss": 40.2185,
"step": 552
},
{
"epoch": 0.21629549954775465,
"grad_norm": 1.1996833086013794,
"learning_rate": 3.3738432505622653e-06,
"loss": 43.9982,
"step": 553
},
{
"epoch": 0.216686630650011,
"grad_norm": 0.203975647687912,
"learning_rate": 3.357012341376421e-06,
"loss": 29.5247,
"step": 554
},
{
"epoch": 0.21707776175226734,
"grad_norm": 0.3972192108631134,
"learning_rate": 3.3402022769274422e-06,
"loss": 12.6732,
"step": 555
},
{
"epoch": 0.21746889285452367,
"grad_norm": 0.3920489251613617,
"learning_rate": 3.3234132704861786e-06,
"loss": 10.7088,
"step": 556
},
{
"epoch": 0.21786002395678,
"grad_norm": 1.8879612684249878,
"learning_rate": 3.306645535056312e-06,
"loss": 78.3652,
"step": 557
},
{
"epoch": 0.21825115505903636,
"grad_norm": 1339.1312255859375,
"learning_rate": 3.289899283371657e-06,
"loss": 115.3514,
"step": 558
},
{
"epoch": 0.2186422861612927,
"grad_norm": 0.34053364396095276,
"learning_rate": 3.273174727893463e-06,
"loss": 30.0332,
"step": 559
},
{
"epoch": 0.21903341726354902,
"grad_norm": 1.0308630466461182,
"learning_rate": 3.2564720808077167e-06,
"loss": 45.353,
"step": 560
},
{
"epoch": 0.21942454836580536,
"grad_norm": 0.11198209971189499,
"learning_rate": 3.2397915540224493e-06,
"loss": 21.2855,
"step": 561
},
{
"epoch": 0.21981567946806171,
"grad_norm": 0.1384800672531128,
"learning_rate": 3.2231333591650567e-06,
"loss": 1.2242,
"step": 562
},
{
"epoch": 0.22020681057031805,
"grad_norm": 0.2989153563976288,
"learning_rate": 3.2064977075795988e-06,
"loss": 25.7958,
"step": 563
},
{
"epoch": 0.22059794167257438,
"grad_norm": 1617.5919189453125,
"learning_rate": 3.189884810324133e-06,
"loss": 41.6185,
"step": 564
},
{
"epoch": 0.2209890727748307,
"grad_norm": 0.4813622236251831,
"learning_rate": 3.173294878168025e-06,
"loss": 54.3557,
"step": 565
},
{
"epoch": 0.22138020387708704,
"grad_norm": 207.3912811279297,
"learning_rate": 3.1567281215892868e-06,
"loss": 23.3589,
"step": 566
},
{
"epoch": 0.2217713349793434,
"grad_norm": 0.6962729096412659,
"learning_rate": 3.140184750771895e-06,
"loss": 53.3044,
"step": 567
},
{
"epoch": 0.22216246608159973,
"grad_norm": 0.3524315059185028,
"learning_rate": 3.12366497560313e-06,
"loss": 44.0667,
"step": 568
},
{
"epoch": 0.22255359718385606,
"grad_norm": 0.2957899868488312,
"learning_rate": 3.1071690056709125e-06,
"loss": 61.0361,
"step": 569
},
{
"epoch": 0.2229447282861124,
"grad_norm": 0.13717715442180634,
"learning_rate": 3.090697050261143e-06,
"loss": 48.0218,
"step": 570
},
{
"epoch": 0.22333585938836875,
"grad_norm": 1.4488298892974854,
"learning_rate": 3.074249318355046e-06,
"loss": 42.8739,
"step": 571
},
{
"epoch": 0.22372699049062508,
"grad_norm": 0.19119593501091003,
"learning_rate": 3.057826018626527e-06,
"loss": 131.6781,
"step": 572
},
{
"epoch": 0.22411812159288141,
"grad_norm": 0.4780034124851227,
"learning_rate": 3.0414273594395106e-06,
"loss": 68.4404,
"step": 573
},
{
"epoch": 0.22450925269513775,
"grad_norm": 0.25417017936706543,
"learning_rate": 3.0250535488453077e-06,
"loss": 78.3688,
"step": 574
},
{
"epoch": 0.22490038379739408,
"grad_norm": 0.2417258769273758,
"learning_rate": 3.008704794579973e-06,
"loss": 116.528,
"step": 575
},
{
"epoch": 0.22529151489965044,
"grad_norm": 0.19122564792633057,
"learning_rate": 2.9923813040616685e-06,
"loss": 29.6101,
"step": 576
},
{
"epoch": 0.22568264600190677,
"grad_norm": 0.13005802035331726,
"learning_rate": 2.976083284388031e-06,
"loss": 35.3624,
"step": 577
},
{
"epoch": 0.2260737771041631,
"grad_norm": 381.5791320800781,
"learning_rate": 2.959810942333552e-06,
"loss": 90.369,
"step": 578
},
{
"epoch": 0.22646490820641943,
"grad_norm": 0.19559843838214874,
"learning_rate": 2.9435644843469434e-06,
"loss": 51.9091,
"step": 579
},
{
"epoch": 0.2268560393086758,
"grad_norm": 0.3959593176841736,
"learning_rate": 2.9273441165485227e-06,
"loss": 38.9128,
"step": 580
},
{
"epoch": 0.22724717041093212,
"grad_norm": 0.18184183537960052,
"learning_rate": 2.9111500447276053e-06,
"loss": 51.5855,
"step": 581
},
{
"epoch": 0.22763830151318845,
"grad_norm": 0.6553506851196289,
"learning_rate": 2.8949824743398804e-06,
"loss": 30.2534,
"step": 582
},
{
"epoch": 0.22802943261544478,
"grad_norm": 0.18349502980709076,
"learning_rate": 2.8788416105048124e-06,
"loss": 15.0662,
"step": 583
},
{
"epoch": 0.22842056371770111,
"grad_norm": 0.57335364818573,
"learning_rate": 2.862727658003042e-06,
"loss": 48.4215,
"step": 584
},
{
"epoch": 0.22881169481995747,
"grad_norm": 0.36635109782218933,
"learning_rate": 2.8466408212737777e-06,
"loss": 4.5718,
"step": 585
},
{
"epoch": 0.2292028259222138,
"grad_norm": 0.2556889057159424,
"learning_rate": 2.83058130441221e-06,
"loss": 43.9021,
"step": 586
},
{
"epoch": 0.22959395702447014,
"grad_norm": 1374.17138671875,
"learning_rate": 2.8145493111669186e-06,
"loss": 61.402,
"step": 587
},
{
"epoch": 0.22998508812672647,
"grad_norm": 0.1971081793308258,
"learning_rate": 2.79854504493729e-06,
"loss": 49.1368,
"step": 588
},
{
"epoch": 0.23037621922898283,
"grad_norm": 1701.60693359375,
"learning_rate": 2.782568708770933e-06,
"loss": 53.8848,
"step": 589
},
{
"epoch": 0.23076735033123916,
"grad_norm": 789.6815185546875,
"learning_rate": 2.7666205053611097e-06,
"loss": 17.3218,
"step": 590
},
{
"epoch": 0.2311584814334955,
"grad_norm": 0.25564226508140564,
"learning_rate": 2.7507006370441557e-06,
"loss": 8.3916,
"step": 591
},
{
"epoch": 0.23154961253575182,
"grad_norm": 1354.3128662109375,
"learning_rate": 2.734809305796915e-06,
"loss": 48.9245,
"step": 592
},
{
"epoch": 0.23194074363800815,
"grad_norm": 161.26748657226562,
"learning_rate": 2.718946713234185e-06,
"loss": 56.0846,
"step": 593
},
{
"epoch": 0.2323318747402645,
"grad_norm": 0.32130080461502075,
"learning_rate": 2.7031130606061486e-06,
"loss": 38.3063,
"step": 594
},
{
"epoch": 0.23272300584252084,
"grad_norm": 1.8923166990280151,
"learning_rate": 2.687308548795825e-06,
"loss": 47.5562,
"step": 595
},
{
"epoch": 0.23311413694477717,
"grad_norm": 0.13152983784675598,
"learning_rate": 2.67153337831652e-06,
"loss": 26.3863,
"step": 596
},
{
"epoch": 0.2335052680470335,
"grad_norm": 0.1482762098312378,
"learning_rate": 2.6557877493092885e-06,
"loss": 79.9195,
"step": 597
},
{
"epoch": 0.23389639914928986,
"grad_norm": 0.4109286069869995,
"learning_rate": 2.6400718615403852e-06,
"loss": 50.5231,
"step": 598
},
{
"epoch": 0.2342875302515462,
"grad_norm": 0.8140275478363037,
"learning_rate": 2.624385914398737e-06,
"loss": 55.21,
"step": 599
},
{
"epoch": 0.23467866135380253,
"grad_norm": 1361.185302734375,
"learning_rate": 2.608730106893411e-06,
"loss": 51.1663,
"step": 600
},
{
"epoch": 0.23506979245605886,
"grad_norm": 0.4616069793701172,
"learning_rate": 2.5931046376510875e-06,
"loss": 14.65,
"step": 601
},
{
"epoch": 0.23546092355831522,
"grad_norm": 0.43858256936073303,
"learning_rate": 2.5775097049135445e-06,
"loss": 21.6647,
"step": 602
},
{
"epoch": 0.23585205466057155,
"grad_norm": 0.2774844765663147,
"learning_rate": 2.561945506535144e-06,
"loss": 37.901,
"step": 603
},
{
"epoch": 0.23624318576282788,
"grad_norm": 1338.126220703125,
"learning_rate": 2.5464122399803126e-06,
"loss": 66.4596,
"step": 604
},
{
"epoch": 0.2366343168650842,
"grad_norm": 0.4828522801399231,
"learning_rate": 2.5309101023210426e-06,
"loss": 11.7746,
"step": 605
},
{
"epoch": 0.23702544796734054,
"grad_norm": 1.3893892765045166,
"learning_rate": 2.5154392902343966e-06,
"loss": 38.6192,
"step": 606
},
{
"epoch": 0.2374165790695969,
"grad_norm": 2.981203556060791,
"learning_rate": 2.5000000000000015e-06,
"loss": 15.9302,
"step": 607
},
{
"epoch": 0.23780771017185323,
"grad_norm": 597.11279296875,
"learning_rate": 2.4845924274975625e-06,
"loss": 45.9918,
"step": 608
},
{
"epoch": 0.23819884127410956,
"grad_norm": 1.3516067266464233,
"learning_rate": 2.4692167682043855e-06,
"loss": 46.3228,
"step": 609
},
{
"epoch": 0.2385899723763659,
"grad_norm": 0.1671498566865921,
"learning_rate": 2.4538732171928847e-06,
"loss": 43.2643,
"step": 610
},
{
"epoch": 0.23898110347862225,
"grad_norm": 0.3257511854171753,
"learning_rate": 2.4385619691281144e-06,
"loss": 44.6005,
"step": 611
},
{
"epoch": 0.23937223458087858,
"grad_norm": 0.4877566993236542,
"learning_rate": 2.4232832182653014e-06,
"loss": 72.7035,
"step": 612
},
{
"epoch": 0.23976336568313492,
"grad_norm": 1470.4984130859375,
"learning_rate": 2.408037158447375e-06,
"loss": 50.2886,
"step": 613
},
{
"epoch": 0.24015449678539125,
"grad_norm": 0.78384929895401,
"learning_rate": 2.39282398310251e-06,
"loss": 19.3894,
"step": 614
},
{
"epoch": 0.24054562788764758,
"grad_norm": 0.2225428968667984,
"learning_rate": 2.3776438852416743e-06,
"loss": 53.8646,
"step": 615
},
{
"epoch": 0.24093675898990394,
"grad_norm": 0.996082603931427,
"learning_rate": 2.3624970574561773e-06,
"loss": 2.6708,
"step": 616
},
{
"epoch": 0.24132789009216027,
"grad_norm": 0.38895338773727417,
"learning_rate": 2.3473836919152267e-06,
"loss": 50.9521,
"step": 617
},
{
"epoch": 0.2417190211944166,
"grad_norm": 1.0213991403579712,
"learning_rate": 2.332303980363497e-06,
"loss": 44.1723,
"step": 618
},
{
"epoch": 0.24211015229667293,
"grad_norm": 0.5459339022636414,
"learning_rate": 2.317258114118686e-06,
"loss": 1.9167,
"step": 619
},
{
"epoch": 0.2425012833989293,
"grad_norm": 428.5580139160156,
"learning_rate": 2.3022462840690933e-06,
"loss": 60.3034,
"step": 620
},
{
"epoch": 0.24289241450118562,
"grad_norm": 1972.2655029296875,
"learning_rate": 2.2872686806712037e-06,
"loss": 53.7348,
"step": 621
},
{
"epoch": 0.24328354560344195,
"grad_norm": 0.9528247714042664,
"learning_rate": 2.272325493947257e-06,
"loss": 34.783,
"step": 622
},
{
"epoch": 0.24367467670569828,
"grad_norm": 0.456136554479599,
"learning_rate": 2.257416913482853e-06,
"loss": 32.2854,
"step": 623
},
{
"epoch": 0.24406580780795462,
"grad_norm": 0.22959932684898376,
"learning_rate": 2.2425431284245302e-06,
"loss": 27.3752,
"step": 624
},
{
"epoch": 0.24445693891021097,
"grad_norm": 0.48872700333595276,
"learning_rate": 2.2277043274773856e-06,
"loss": 22.8773,
"step": 625
},
{
"epoch": 0.2448480700124673,
"grad_norm": 1.4068603515625,
"learning_rate": 2.2129006989026612e-06,
"loss": 26.3108,
"step": 626
},
{
"epoch": 0.24523920111472364,
"grad_norm": 2.354417085647583,
"learning_rate": 2.1981324305153644e-06,
"loss": 60.9464,
"step": 627
},
{
"epoch": 0.24563033221697997,
"grad_norm": 1.7362618446350098,
"learning_rate": 2.1833997096818897e-06,
"loss": 18.6702,
"step": 628
},
{
"epoch": 0.24602146331923633,
"grad_norm": 471.421875,
"learning_rate": 2.168702723317632e-06,
"loss": 13.0568,
"step": 629
},
{
"epoch": 0.24641259442149266,
"grad_norm": 0.3530268669128418,
"learning_rate": 2.1540416578846207e-06,
"loss": 14.6524,
"step": 630
},
{
"epoch": 0.246803725523749,
"grad_norm": 0.7190971970558167,
"learning_rate": 2.139416699389153e-06,
"loss": 12.7703,
"step": 631
},
{
"epoch": 0.24719485662600532,
"grad_norm": 2144.53125,
"learning_rate": 2.1248280333794347e-06,
"loss": 24.8344,
"step": 632
},
{
"epoch": 0.24758598772826168,
"grad_norm": 1532.8892822265625,
"learning_rate": 2.1102758449432233e-06,
"loss": 48.1845,
"step": 633
},
{
"epoch": 0.247977118830518,
"grad_norm": 0.18892136216163635,
"learning_rate": 2.095760318705487e-06,
"loss": 41.4411,
"step": 634
},
{
"epoch": 0.24836824993277434,
"grad_norm": 0.43188950419425964,
"learning_rate": 2.081281638826052e-06,
"loss": 15.6649,
"step": 635
},
{
"epoch": 0.24875938103503067,
"grad_norm": 0.247590571641922,
"learning_rate": 2.0668399889972717e-06,
"loss": 52.5673,
"step": 636
},
{
"epoch": 0.249150512137287,
"grad_norm": 0.7205409407615662,
"learning_rate": 2.0524355524417017e-06,
"loss": 16.6089,
"step": 637
},
{
"epoch": 0.24954164323954336,
"grad_norm": 0.30670174956321716,
"learning_rate": 2.038068511909762e-06,
"loss": 25.2432,
"step": 638
},
{
"epoch": 0.2499327743417997,
"grad_norm": 1323.96875,
"learning_rate": 2.0237390496774284e-06,
"loss": 47.6592,
"step": 639
},
{
"epoch": 0.250323905444056,
"grad_norm": 0.4727286696434021,
"learning_rate": 2.00944734754392e-06,
"loss": 26.6763,
"step": 640
},
{
"epoch": 0.2507150365463124,
"grad_norm": 0.2739315927028656,
"learning_rate": 1.995193586829387e-06,
"loss": 60.4231,
"step": 641
},
{
"epoch": 0.2511061676485687,
"grad_norm": 0.31088632345199585,
"learning_rate": 1.980977948372612e-06,
"loss": 12.4899,
"step": 642
},
{
"epoch": 0.25149729875082505,
"grad_norm": 0.2787396013736725,
"learning_rate": 1.966800612528723e-06,
"loss": 52.5619,
"step": 643
},
{
"epoch": 0.25188842985308135,
"grad_norm": 0.15130284428596497,
"learning_rate": 1.952661759166893e-06,
"loss": 27.9581,
"step": 644
},
{
"epoch": 0.2522795609553377,
"grad_norm": 1.2856502532958984,
"learning_rate": 1.9385615676680663e-06,
"loss": 19.7204,
"step": 645
},
{
"epoch": 0.25267069205759407,
"grad_norm": 772.9840087890625,
"learning_rate": 1.9245002169226814e-06,
"loss": 47.7722,
"step": 646
},
{
"epoch": 0.2530618231598504,
"grad_norm": 342.9947509765625,
"learning_rate": 1.910477885328399e-06,
"loss": 20.2267,
"step": 647
},
{
"epoch": 0.25345295426210673,
"grad_norm": 0.65385502576828,
"learning_rate": 1.8964947507878401e-06,
"loss": 24.3055,
"step": 648
},
{
"epoch": 0.2538440853643631,
"grad_norm": 2241.752197265625,
"learning_rate": 1.8825509907063328e-06,
"loss": 61.2555,
"step": 649
},
{
"epoch": 0.2542352164666194,
"grad_norm": 1303.792724609375,
"learning_rate": 1.8686467819896542e-06,
"loss": 28.9474,
"step": 650
},
{
"epoch": 0.25462634756887575,
"grad_norm": 1.3625749349594116,
"learning_rate": 1.8547823010417876e-06,
"loss": 39.3853,
"step": 651
},
{
"epoch": 0.25501747867113206,
"grad_norm": 0.34665849804878235,
"learning_rate": 1.8409577237626935e-06,
"loss": 35.6182,
"step": 652
},
{
"epoch": 0.2554086097733884,
"grad_norm": 4.089036464691162,
"learning_rate": 1.8271732255460644e-06,
"loss": 4.3633,
"step": 653
},
{
"epoch": 0.2557997408756448,
"grad_norm": 1152.780517578125,
"learning_rate": 1.8134289812771077e-06,
"loss": 49.1108,
"step": 654
},
{
"epoch": 0.2561908719779011,
"grad_norm": 0.6453447341918945,
"learning_rate": 1.7997251653303249e-06,
"loss": 40.7015,
"step": 655
},
{
"epoch": 0.25658200308015744,
"grad_norm": 93.7674789428711,
"learning_rate": 1.7860619515673034e-06,
"loss": 30.0199,
"step": 656
},
{
"epoch": 0.25697313418241374,
"grad_norm": 0.4583728313446045,
"learning_rate": 1.7724395133345025e-06,
"loss": 67.6765,
"step": 657
},
{
"epoch": 0.2573642652846701,
"grad_norm": 0.5361828207969666,
"learning_rate": 1.7588580234610592e-06,
"loss": 17.3098,
"step": 658
},
{
"epoch": 0.25775539638692646,
"grad_norm": 0.22744417190551758,
"learning_rate": 1.7453176542565958e-06,
"loss": 38.1001,
"step": 659
},
{
"epoch": 0.25814652748918276,
"grad_norm": 0.23170587420463562,
"learning_rate": 1.7318185775090336e-06,
"loss": 10.4951,
"step": 660
},
{
"epoch": 0.2585376585914391,
"grad_norm": 153.93882751464844,
"learning_rate": 1.7183609644824096e-06,
"loss": 52.739,
"step": 661
},
{
"epoch": 0.2589287896936955,
"grad_norm": 0.1890573650598526,
"learning_rate": 1.7049449859147121e-06,
"loss": 29.1403,
"step": 662
},
{
"epoch": 0.2593199207959518,
"grad_norm": 1.4989005327224731,
"learning_rate": 1.6915708120157042e-06,
"loss": 44.5519,
"step": 663
},
{
"epoch": 0.25971105189820815,
"grad_norm": 1.5247461795806885,
"learning_rate": 1.67823861246477e-06,
"loss": 40.8943,
"step": 664
},
{
"epoch": 0.26010218300046445,
"grad_norm": 1.0584818124771118,
"learning_rate": 1.6649485564087646e-06,
"loss": 29.1124,
"step": 665
},
{
"epoch": 0.2604933141027208,
"grad_norm": 1.388893485069275,
"learning_rate": 1.6517008124598622e-06,
"loss": 20.1575,
"step": 666
},
{
"epoch": 0.26088444520497717,
"grad_norm": 0.27823394536972046,
"learning_rate": 1.6384955486934157e-06,
"loss": 19.5398,
"step": 667
},
{
"epoch": 0.26127557630723347,
"grad_norm": 0.1498628854751587,
"learning_rate": 1.6253329326458367e-06,
"loss": 43.7116,
"step": 668
},
{
"epoch": 0.26166670740948983,
"grad_norm": 5067.41357421875,
"learning_rate": 1.612213131312454e-06,
"loss": 62.5826,
"step": 669
},
{
"epoch": 0.26205783851174613,
"grad_norm": 0.3824155926704407,
"learning_rate": 1.5991363111454023e-06,
"loss": 43.7866,
"step": 670
},
{
"epoch": 0.2624489696140025,
"grad_norm": 0.30052921175956726,
"learning_rate": 1.5861026380515165e-06,
"loss": 80.1734,
"step": 671
},
{
"epoch": 0.26284010071625885,
"grad_norm": 0.14637216925621033,
"learning_rate": 1.5731122773902147e-06,
"loss": 17.5782,
"step": 672
},
{
"epoch": 0.26323123181851515,
"grad_norm": 0.14098778367042542,
"learning_rate": 1.5601653939714073e-06,
"loss": 52.0247,
"step": 673
},
{
"epoch": 0.2636223629207715,
"grad_norm": 331.9844055175781,
"learning_rate": 1.547262152053406e-06,
"loss": 32.7034,
"step": 674
},
{
"epoch": 0.2640134940230278,
"grad_norm": 0.5405360460281372,
"learning_rate": 1.5344027153408375e-06,
"loss": 49.8961,
"step": 675
},
{
"epoch": 0.2644046251252842,
"grad_norm": 0.21132820844650269,
"learning_rate": 1.5215872469825682e-06,
"loss": 4.232,
"step": 676
},
{
"epoch": 0.26479575622754054,
"grad_norm": 0.930949330329895,
"learning_rate": 1.5088159095696365e-06,
"loss": 43.6584,
"step": 677
},
{
"epoch": 0.26518688732979684,
"grad_norm": 0.6677309274673462,
"learning_rate": 1.4960888651331833e-06,
"loss": 40.9175,
"step": 678
},
{
"epoch": 0.2655780184320532,
"grad_norm": 0.35741525888442993,
"learning_rate": 1.4834062751424018e-06,
"loss": 22.68,
"step": 679
},
{
"epoch": 0.26596914953430956,
"grad_norm": 1111.9866943359375,
"learning_rate": 1.4707683005024898e-06,
"loss": 41.6105,
"step": 680
},
{
"epoch": 0.26636028063656586,
"grad_norm": 0.6054497957229614,
"learning_rate": 1.4581751015526035e-06,
"loss": 27.9255,
"step": 681
},
{
"epoch": 0.2667514117388222,
"grad_norm": 0.14572077989578247,
"learning_rate": 1.4456268380638262e-06,
"loss": 4.4698,
"step": 682
},
{
"epoch": 0.2671425428410785,
"grad_norm": 0.28749310970306396,
"learning_rate": 1.4331236692371386e-06,
"loss": 24.8405,
"step": 683
},
{
"epoch": 0.2675336739433349,
"grad_norm": 0.3515958786010742,
"learning_rate": 1.4206657537014078e-06,
"loss": 14.6068,
"step": 684
},
{
"epoch": 0.26792480504559124,
"grad_norm": 554.953857421875,
"learning_rate": 1.4082532495113627e-06,
"loss": 43.7761,
"step": 685
},
{
"epoch": 0.26831593614784754,
"grad_norm": 0.34047558903694153,
"learning_rate": 1.3958863141455937e-06,
"loss": 69.1389,
"step": 686
},
{
"epoch": 0.2687070672501039,
"grad_norm": 0.31358927488327026,
"learning_rate": 1.38356510450456e-06,
"loss": 48.2586,
"step": 687
},
{
"epoch": 0.2690981983523602,
"grad_norm": 0.18278853595256805,
"learning_rate": 1.3712897769085903e-06,
"loss": 72.0719,
"step": 688
},
{
"epoch": 0.26948932945461657,
"grad_norm": 1582.910888671875,
"learning_rate": 1.3590604870959046e-06,
"loss": 32.1289,
"step": 689
},
{
"epoch": 0.2698804605568729,
"grad_norm": 0.1720167100429535,
"learning_rate": 1.3468773902206378e-06,
"loss": 51.6635,
"step": 690
},
{
"epoch": 0.27027159165912923,
"grad_norm": 0.14479337632656097,
"learning_rate": 1.3347406408508695e-06,
"loss": 34.0947,
"step": 691
},
{
"epoch": 0.2706627227613856,
"grad_norm": 0.16732390224933624,
"learning_rate": 1.322650392966665e-06,
"loss": 15.8947,
"step": 692
},
{
"epoch": 0.27105385386364195,
"grad_norm": 0.9922708868980408,
"learning_rate": 1.3106067999581224e-06,
"loss": 23.1571,
"step": 693
},
{
"epoch": 0.27144498496589825,
"grad_norm": 971.8731689453125,
"learning_rate": 1.298610014623423e-06,
"loss": 52.2057,
"step": 694
},
{
"epoch": 0.2718361160681546,
"grad_norm": 824.1338500976562,
"learning_rate": 1.2866601891668945e-06,
"loss": 24.8422,
"step": 695
},
{
"epoch": 0.2722272471704109,
"grad_norm": 0.136683851480484,
"learning_rate": 1.2747574751970826e-06,
"loss": 11.3418,
"step": 696
},
{
"epoch": 0.27261837827266727,
"grad_norm": 0.8831171989440918,
"learning_rate": 1.2629020237248241e-06,
"loss": 27.774,
"step": 697
},
{
"epoch": 0.27300950937492363,
"grad_norm": 0.16165444254875183,
"learning_rate": 1.2510939851613285e-06,
"loss": 43.9631,
"step": 698
},
{
"epoch": 0.27340064047717993,
"grad_norm": 1.4575397968292236,
"learning_rate": 1.239333509316281e-06,
"loss": 60.9957,
"step": 699
},
{
"epoch": 0.2737917715794363,
"grad_norm": 0.35259395837783813,
"learning_rate": 1.2276207453959283e-06,
"loss": 50.4399,
"step": 700
},
{
"epoch": 0.2741829026816926,
"grad_norm": 0.22403627634048462,
"learning_rate": 1.2159558420011907e-06,
"loss": 19.9137,
"step": 701
},
{
"epoch": 0.27457403378394896,
"grad_norm": 1029.162353515625,
"learning_rate": 1.2043389471257833e-06,
"loss": 31.7524,
"step": 702
},
{
"epoch": 0.2749651648862053,
"grad_norm": 900.6036987304688,
"learning_rate": 1.1927702081543279e-06,
"loss": 11.9903,
"step": 703
},
{
"epoch": 0.2753562959884616,
"grad_norm": 0.16628578305244446,
"learning_rate": 1.1812497718604887e-06,
"loss": 11.8731,
"step": 704
},
{
"epoch": 0.275747427090718,
"grad_norm": 0.20883895456790924,
"learning_rate": 1.1697777844051105e-06,
"loss": 2.1834,
"step": 705
},
{
"epoch": 0.2761385581929743,
"grad_norm": 1.7050410509109497,
"learning_rate": 1.158354391334362e-06,
"loss": 18.1528,
"step": 706
},
{
"epoch": 0.27652968929523064,
"grad_norm": 1408.4571533203125,
"learning_rate": 1.1469797375778902e-06,
"loss": 37.1046,
"step": 707
},
{
"epoch": 0.276920820397487,
"grad_norm": 0.5376846790313721,
"learning_rate": 1.1356539674469852e-06,
"loss": 32.4544,
"step": 708
},
{
"epoch": 0.2773119514997433,
"grad_norm": 0.30538272857666016,
"learning_rate": 1.1243772246327416e-06,
"loss": 26.5578,
"step": 709
},
{
"epoch": 0.27770308260199966,
"grad_norm": 0.1639033704996109,
"learning_rate": 1.1131496522042424e-06,
"loss": 24.755,
"step": 710
},
{
"epoch": 0.278094213704256,
"grad_norm": 296.2695007324219,
"learning_rate": 1.1019713926067394e-06,
"loss": 37.8495,
"step": 711
},
{
"epoch": 0.2784853448065123,
"grad_norm": 0.9918115735054016,
"learning_rate": 1.0908425876598512e-06,
"loss": 32.8862,
"step": 712
},
{
"epoch": 0.2788764759087687,
"grad_norm": 803.0256958007812,
"learning_rate": 1.0797633785557582e-06,
"loss": 17.9585,
"step": 713
},
{
"epoch": 0.279267607011025,
"grad_norm": 0.4314124584197998,
"learning_rate": 1.068733905857413e-06,
"loss": 11.3658,
"step": 714
},
{
"epoch": 0.27965873811328135,
"grad_norm": 1402.580078125,
"learning_rate": 1.0577543094967613e-06,
"loss": 62.5657,
"step": 715
},
{
"epoch": 0.2800498692155377,
"grad_norm": 1.8656405210494995,
"learning_rate": 1.0468247287729593e-06,
"loss": 33.3876,
"step": 716
},
{
"epoch": 0.280441000317794,
"grad_norm": 0.4445607662200928,
"learning_rate": 1.0359453023506123e-06,
"loss": 9.0196,
"step": 717
},
{
"epoch": 0.28083213142005037,
"grad_norm": 0.5976535081863403,
"learning_rate": 1.0251161682580125e-06,
"loss": 21.402,
"step": 718
},
{
"epoch": 0.28122326252230667,
"grad_norm": 0.54698646068573,
"learning_rate": 1.0143374638853892e-06,
"loss": 26.4295,
"step": 719
},
{
"epoch": 0.28161439362456303,
"grad_norm": 0.8513966798782349,
"learning_rate": 1.0036093259831624e-06,
"loss": 32.0522,
"step": 720
},
{
"epoch": 0.2820055247268194,
"grad_norm": 0.45537883043289185,
"learning_rate": 9.929318906602176e-07,
"loss": 39.6447,
"step": 721
},
{
"epoch": 0.2823966558290757,
"grad_norm": 1060.1263427734375,
"learning_rate": 9.823052933821643e-07,
"loss": 37.912,
"step": 722
},
{
"epoch": 0.28278778693133205,
"grad_norm": 0.5889149308204651,
"learning_rate": 9.717296689696283e-07,
"loss": 33.4835,
"step": 723
},
{
"epoch": 0.28317891803358836,
"grad_norm": 1.4640785455703735,
"learning_rate": 9.612051515965388e-07,
"loss": 24.0596,
"step": 724
},
{
"epoch": 0.2835700491358447,
"grad_norm": 0.16718535125255585,
"learning_rate": 9.507318747884243e-07,
"loss": 30.542,
"step": 725
},
{
"epoch": 0.2839611802381011,
"grad_norm": 0.5365858674049377,
"learning_rate": 9.403099714207175e-07,
"loss": 43.143,
"step": 726
},
{
"epoch": 0.2843523113403574,
"grad_norm": 1128.4521484375,
"learning_rate": 9.299395737170758e-07,
"loss": 41.8304,
"step": 727
},
{
"epoch": 0.28474344244261374,
"grad_norm": 917.1222534179688,
"learning_rate": 9.196208132476963e-07,
"loss": 27.3092,
"step": 728
},
{
"epoch": 0.2851345735448701,
"grad_norm": 0.3612130582332611,
"learning_rate": 9.093538209276487e-07,
"loss": 22.8086,
"step": 729
},
{
"epoch": 0.2855257046471264,
"grad_norm": 0.26056137681007385,
"learning_rate": 8.991387270152202e-07,
"loss": 22.6093,
"step": 730
},
{
"epoch": 0.28591683574938276,
"grad_norm": 0.23313690721988678,
"learning_rate": 8.88975661110254e-07,
"loss": 20.9189,
"step": 731
},
{
"epoch": 0.28630796685163906,
"grad_norm": 0.1749650090932846,
"learning_rate": 8.78864752152509e-07,
"loss": 17.4111,
"step": 732
},
{
"epoch": 0.2866990979538954,
"grad_norm": 1060.3902587890625,
"learning_rate": 8.688061284200266e-07,
"loss": 78.8863,
"step": 733
},
{
"epoch": 0.2870902290561518,
"grad_norm": 0.22554989159107208,
"learning_rate": 8.587999175274986e-07,
"loss": 44.0863,
"step": 734
},
{
"epoch": 0.2874813601584081,
"grad_norm": 1059.94580078125,
"learning_rate": 8.488462464246495e-07,
"loss": 29.8665,
"step": 735
},
{
"epoch": 0.28787249126066444,
"grad_norm": 949.2442626953125,
"learning_rate": 8.389452413946314e-07,
"loss": 26.6799,
"step": 736
},
{
"epoch": 0.28826362236292075,
"grad_norm": 789.4210205078125,
"learning_rate": 8.290970280524124e-07,
"loss": 25.6707,
"step": 737
},
{
"epoch": 0.2886547534651771,
"grad_norm": 2.4044387340545654,
"learning_rate": 8.193017313431872e-07,
"loss": 10.1112,
"step": 738
},
{
"epoch": 0.28904588456743346,
"grad_norm": 324.33489990234375,
"learning_rate": 8.095594755407971e-07,
"loss": 11.7497,
"step": 739
},
{
"epoch": 0.28943701566968977,
"grad_norm": 1.1166508197784424,
"learning_rate": 7.99870384246143e-07,
"loss": 34.0988,
"step": 740
},
{
"epoch": 0.2898281467719461,
"grad_norm": 0.18654634058475494,
"learning_rate": 7.902345803856265e-07,
"loss": 45.7333,
"step": 741
},
{
"epoch": 0.2902192778742025,
"grad_norm": 0.3777371346950531,
"learning_rate": 7.806521862095834e-07,
"loss": 35.6395,
"step": 742
},
{
"epoch": 0.2906104089764588,
"grad_norm": 0.24813219904899597,
"learning_rate": 7.711233232907401e-07,
"loss": 29.0974,
"step": 743
},
{
"epoch": 0.29100154007871515,
"grad_norm": 0.2076377123594284,
"learning_rate": 7.616481125226632e-07,
"loss": 46.9587,
"step": 744
},
{
"epoch": 0.29139267118097145,
"grad_norm": 0.7038260102272034,
"learning_rate": 7.522266741182305e-07,
"loss": 21.8313,
"step": 745
},
{
"epoch": 0.2917838022832278,
"grad_norm": 217.7123565673828,
"learning_rate": 7.42859127608106e-07,
"loss": 6.3769,
"step": 746
},
{
"epoch": 0.29217493338548417,
"grad_norm": 0.12917295098304749,
"learning_rate": 7.33545591839222e-07,
"loss": 47.6111,
"step": 747
},
{
"epoch": 0.2925660644877405,
"grad_norm": 0.20550723373889923,
"learning_rate": 7.242861849732696e-07,
"loss": 64.3405,
"step": 748
},
{
"epoch": 0.29295719558999683,
"grad_norm": 1334.2445068359375,
"learning_rate": 7.150810244852036e-07,
"loss": 26.1011,
"step": 749
},
{
"epoch": 0.29334832669225314,
"grad_norm": 0.3111410439014435,
"learning_rate": 7.059302271617485e-07,
"loss": 35.9932,
"step": 750
},
{
"epoch": 0.2937394577945095,
"grad_norm": 942.8177490234375,
"learning_rate": 6.968339090999188e-07,
"loss": 31.1454,
"step": 751
},
{
"epoch": 0.29413058889676585,
"grad_norm": 810.7886962890625,
"learning_rate": 6.877921857055476e-07,
"loss": 37.078,
"step": 752
},
{
"epoch": 0.29452171999902216,
"grad_norm": 306.9100036621094,
"learning_rate": 6.78805171691817e-07,
"loss": 34.2818,
"step": 753
},
{
"epoch": 0.2949128511012785,
"grad_norm": 0.9455443620681763,
"learning_rate": 6.698729810778065e-07,
"loss": 13.8228,
"step": 754
},
{
"epoch": 0.2953039822035348,
"grad_norm": 0.8926202654838562,
"learning_rate": 6.609957271870505e-07,
"loss": 48.3806,
"step": 755
},
{
"epoch": 0.2956951133057912,
"grad_norm": 217.76356506347656,
"learning_rate": 6.521735226460901e-07,
"loss": 61.1602,
"step": 756
},
{
"epoch": 0.29608624440804754,
"grad_norm": 0.5974970459938049,
"learning_rate": 6.43406479383053e-07,
"loss": 14.1005,
"step": 757
},
{
"epoch": 0.29647737551030384,
"grad_norm": 1.2981594800949097,
"learning_rate": 6.346947086262323e-07,
"loss": 24.2989,
"step": 758
},
{
"epoch": 0.2968685066125602,
"grad_norm": 0.1865173578262329,
"learning_rate": 6.260383209026704e-07,
"loss": 44.4224,
"step": 759
},
{
"epoch": 0.29725963771481656,
"grad_norm": 0.39853161573410034,
"learning_rate": 6.174374260367611e-07,
"loss": 37.0987,
"step": 760
},
{
"epoch": 0.29765076881707286,
"grad_norm": 0.5360068678855896,
"learning_rate": 6.088921331488568e-07,
"loss": 19.2739,
"step": 761
},
{
"epoch": 0.2980418999193292,
"grad_norm": 840.9296875,
"learning_rate": 6.004025506538813e-07,
"loss": 24.1344,
"step": 762
},
{
"epoch": 0.2984330310215855,
"grad_norm": 0.38888782262802124,
"learning_rate": 5.919687862599549e-07,
"loss": 37.5647,
"step": 763
},
{
"epoch": 0.2988241621238419,
"grad_norm": 1.1136953830718994,
"learning_rate": 5.835909469670292e-07,
"loss": 39.4938,
"step": 764
},
{
"epoch": 0.29921529322609824,
"grad_norm": 0.21624325215816498,
"learning_rate": 5.752691390655279e-07,
"loss": 58.6849,
"step": 765
},
{
"epoch": 0.29960642432835455,
"grad_norm": 0.725331723690033,
"learning_rate": 5.670034681349995e-07,
"loss": 15.0299,
"step": 766
},
{
"epoch": 0.2999975554306109,
"grad_norm": 0.4943119287490845,
"learning_rate": 5.587940390427804e-07,
"loss": 22.3662,
"step": 767
},
{
"epoch": 0.3003886865328672,
"grad_norm": 0.3493794798851013,
"learning_rate": 5.506409559426573e-07,
"loss": 5.3336,
"step": 768
},
{
"epoch": 0.30077981763512357,
"grad_norm": 0.16925600171089172,
"learning_rate": 5.425443222735527e-07,
"loss": 24.5783,
"step": 769
},
{
"epoch": 0.30117094873737993,
"grad_norm": 0.42793917655944824,
"learning_rate": 5.345042407582079e-07,
"loss": 22.6456,
"step": 770
},
{
"epoch": 0.30156207983963623,
"grad_norm": 1158.593994140625,
"learning_rate": 5.265208134018851e-07,
"loss": 28.3467,
"step": 771
},
{
"epoch": 0.3019532109418926,
"grad_norm": 1398.79248046875,
"learning_rate": 5.185941414910673e-07,
"loss": 13.612,
"step": 772
},
{
"epoch": 0.30234434204414895,
"grad_norm": 0.3084481954574585,
"learning_rate": 5.107243255921746e-07,
"loss": 40.542,
"step": 773
},
{
"epoch": 0.30273547314640525,
"grad_norm": 0.6440997123718262,
"learning_rate": 5.029114655502937e-07,
"loss": 23.8275,
"step": 774
},
{
"epoch": 0.3031266042486616,
"grad_norm": 582.021484375,
"learning_rate": 4.951556604879049e-07,
"loss": 52.9339,
"step": 775
},
{
"epoch": 0.3035177353509179,
"grad_norm": 297.0395202636719,
"learning_rate": 4.874570088036252e-07,
"loss": 14.6142,
"step": 776
},
{
"epoch": 0.3039088664531743,
"grad_norm": 0.25378304719924927,
"learning_rate": 4.798156081709638e-07,
"loss": 4.0975,
"step": 777
},
{
"epoch": 0.30429999755543063,
"grad_norm": 0.7152448892593384,
"learning_rate": 4.722315555370793e-07,
"loss": 14.747,
"step": 778
},
{
"epoch": 0.30469112865768694,
"grad_norm": 750.841064453125,
"learning_rate": 4.647049471215498e-07,
"loss": 10.1092,
"step": 779
},
{
"epoch": 0.3050822597599433,
"grad_norm": 0.16996781527996063,
"learning_rate": 4.5723587841515707e-07,
"loss": 8.5861,
"step": 780
},
{
"epoch": 0.3054733908621996,
"grad_norm": 0.2573756277561188,
"learning_rate": 4.4982444417866753e-07,
"loss": 28.2554,
"step": 781
},
{
"epoch": 0.30586452196445596,
"grad_norm": 0.8471802473068237,
"learning_rate": 4.4247073844163434e-07,
"loss": 23.4402,
"step": 782
},
{
"epoch": 0.3062556530667123,
"grad_norm": 711.4560546875,
"learning_rate": 4.351748545012058e-07,
"loss": 35.54,
"step": 783
},
{
"epoch": 0.3066467841689686,
"grad_norm": 660.20458984375,
"learning_rate": 4.279368849209381e-07,
"loss": 32.8363,
"step": 784
},
{
"epoch": 0.307037915271225,
"grad_norm": 60.09021759033203,
"learning_rate": 4.2075692152962145e-07,
"loss": 25.1056,
"step": 785
},
{
"epoch": 0.3074290463734813,
"grad_norm": 0.995664119720459,
"learning_rate": 4.136350554201196e-07,
"loss": 18.2107,
"step": 786
},
{
"epoch": 0.30782017747573764,
"grad_norm": 1.327789545059204,
"learning_rate": 4.0657137694820826e-07,
"loss": 31.5689,
"step": 787
},
{
"epoch": 0.308211308577994,
"grad_norm": 0.5742402076721191,
"learning_rate": 3.9956597573142966e-07,
"loss": 9.6179,
"step": 788
},
{
"epoch": 0.3086024396802503,
"grad_norm": 0.27793997526168823,
"learning_rate": 3.9261894064796136e-07,
"loss": 32.6346,
"step": 789
},
{
"epoch": 0.30899357078250667,
"grad_norm": 1185.6458740234375,
"learning_rate": 3.8573035983548167e-07,
"loss": 38.3132,
"step": 790
},
{
"epoch": 0.309384701884763,
"grad_norm": 1311.600830078125,
"learning_rate": 3.789003206900538e-07,
"loss": 44.8882,
"step": 791
},
{
"epoch": 0.30977583298701933,
"grad_norm": 0.7463202476501465,
"learning_rate": 3.7212890986501773e-07,
"loss": 33.5164,
"step": 792
},
{
"epoch": 0.3101669640892757,
"grad_norm": 0.1619306206703186,
"learning_rate": 3.6541621326989183e-07,
"loss": 8.8827,
"step": 793
},
{
"epoch": 0.310558095191532,
"grad_norm": 0.290446937084198,
"learning_rate": 3.5876231606927936e-07,
"loss": 22.0263,
"step": 794
},
{
"epoch": 0.31094922629378835,
"grad_norm": 0.5374624729156494,
"learning_rate": 3.5216730268179346e-07,
"loss": 54.9825,
"step": 795
},
{
"epoch": 0.3113403573960447,
"grad_norm": 0.45844340324401855,
"learning_rate": 3.4563125677897936e-07,
"loss": 34.1724,
"step": 796
},
{
"epoch": 0.311731488498301,
"grad_norm": 0.21980485320091248,
"learning_rate": 3.3915426128425744e-07,
"loss": 23.5609,
"step": 797
},
{
"epoch": 0.31212261960055737,
"grad_norm": 0.1387357860803604,
"learning_rate": 3.327363983718723e-07,
"loss": 3.3226,
"step": 798
},
{
"epoch": 0.3125137507028137,
"grad_norm": 703.876708984375,
"learning_rate": 3.263777494658449e-07,
"loss": 14.627,
"step": 799
},
{
"epoch": 0.31290488180507003,
"grad_norm": 0.3398139178752899,
"learning_rate": 3.200783952389447e-07,
"loss": 23.2432,
"step": 800
},
{
"epoch": 0.3132960129073264,
"grad_norm": 0.2651851177215576,
"learning_rate": 3.138384156116614e-07,
"loss": 22.8204,
"step": 801
},
{
"epoch": 0.3136871440095827,
"grad_norm": 0.7338706254959106,
"learning_rate": 3.076578897511978e-07,
"loss": 8.9847,
"step": 802
},
{
"epoch": 0.31407827511183906,
"grad_norm": 674.1719360351562,
"learning_rate": 3.015368960704584e-07,
"loss": 42.1909,
"step": 803
},
{
"epoch": 0.31446940621409536,
"grad_norm": 0.5109131932258606,
"learning_rate": 2.954755122270564e-07,
"loss": 16.502,
"step": 804
},
{
"epoch": 0.3148605373163517,
"grad_norm": 38.51890563964844,
"learning_rate": 2.894738151223331e-07,
"loss": 22.2199,
"step": 805
},
{
"epoch": 0.3152516684186081,
"grad_norm": 236.9777069091797,
"learning_rate": 2.835318809003751e-07,
"loss": 61.7173,
"step": 806
},
{
"epoch": 0.3156427995208644,
"grad_norm": 0.13343866169452667,
"learning_rate": 2.776497849470544e-07,
"loss": 14.8311,
"step": 807
},
{
"epoch": 0.31603393062312074,
"grad_norm": 0.2250102311372757,
"learning_rate": 2.71827601889067e-07,
"loss": 28.0406,
"step": 808
},
{
"epoch": 0.3164250617253771,
"grad_norm": 0.3997911810874939,
"learning_rate": 2.6606540559298956e-07,
"loss": 32.8142,
"step": 809
},
{
"epoch": 0.3168161928276334,
"grad_norm": 947.511474609375,
"learning_rate": 2.6036326916434153e-07,
"loss": 32.0284,
"step": 810
},
{
"epoch": 0.31720732392988976,
"grad_norm": 0.18809598684310913,
"learning_rate": 2.547212649466568e-07,
"loss": 46.4845,
"step": 811
},
{
"epoch": 0.31759845503214607,
"grad_norm": 105.57866668701172,
"learning_rate": 2.491394645205669e-07,
"loss": 36.195,
"step": 812
},
{
"epoch": 0.3179895861344024,
"grad_norm": 700.2158203125,
"learning_rate": 2.436179387028903e-07,
"loss": 38.9557,
"step": 813
},
{
"epoch": 0.3183807172366588,
"grad_norm": 778.9248657226562,
"learning_rate": 2.3815675754573885e-07,
"loss": 22.1128,
"step": 814
},
{
"epoch": 0.3187718483389151,
"grad_norm": 0.24114812910556793,
"learning_rate": 2.3275599033562414e-07,
"loss": 40.5305,
"step": 815
},
{
"epoch": 0.31916297944117145,
"grad_norm": 539.972900390625,
"learning_rate": 2.274157055925802e-07,
"loss": 38.5275,
"step": 816
},
{
"epoch": 0.31955411054342775,
"grad_norm": 0.5195923447608948,
"learning_rate": 2.2213597106929608e-07,
"loss": 31.244,
"step": 817
},
{
"epoch": 0.3199452416456841,
"grad_norm": 242.19908142089844,
"learning_rate": 2.1691685375025362e-07,
"loss": 39.8784,
"step": 818
},
{
"epoch": 0.32033637274794047,
"grad_norm": 0.1984694004058838,
"learning_rate": 2.117584198508771e-07,
"loss": 30.9005,
"step": 819
},
{
"epoch": 0.32072750385019677,
"grad_norm": 987.247802734375,
"learning_rate": 2.0666073481669714e-07,
"loss": 32.3119,
"step": 820
},
{
"epoch": 0.32111863495245313,
"grad_norm": 1.042695164680481,
"learning_rate": 2.016238633225165e-07,
"loss": 41.9533,
"step": 821
},
{
"epoch": 0.3215097660547095,
"grad_norm": 0.18694262206554413,
"learning_rate": 1.9664786927159064e-07,
"loss": 55.5923,
"step": 822
},
{
"epoch": 0.3219008971569658,
"grad_norm": 0.35913291573524475,
"learning_rate": 1.9173281579481896e-07,
"loss": 17.7606,
"step": 823
},
{
"epoch": 0.32229202825922215,
"grad_norm": 0.1270519196987152,
"learning_rate": 1.8687876524993987e-07,
"loss": 6.9452,
"step": 824
},
{
"epoch": 0.32268315936147846,
"grad_norm": 0.25847911834716797,
"learning_rate": 1.820857792207431e-07,
"loss": 11.1015,
"step": 825
},
{
"epoch": 0.3230742904637348,
"grad_norm": 507.70074462890625,
"learning_rate": 1.7735391851628814e-07,
"loss": 32.347,
"step": 826
},
{
"epoch": 0.3234654215659912,
"grad_norm": 90.63712310791016,
"learning_rate": 1.7268324317012974e-07,
"loss": 16.0917,
"step": 827
},
{
"epoch": 0.3238565526682475,
"grad_norm": 0.3610641658306122,
"learning_rate": 1.680738124395598e-07,
"loss": 10.4453,
"step": 828
},
{
"epoch": 0.32424768377050384,
"grad_norm": 1.101544976234436,
"learning_rate": 1.6352568480485277e-07,
"loss": 3.3177,
"step": 829
},
{
"epoch": 0.32463881487276014,
"grad_norm": 1.0258104801177979,
"learning_rate": 1.5903891796852756e-07,
"loss": 30.9836,
"step": 830
},
{
"epoch": 0.3250299459750165,
"grad_norm": 0.22194243967533112,
"learning_rate": 1.5461356885461077e-07,
"loss": 11.0591,
"step": 831
},
{
"epoch": 0.32542107707727286,
"grad_norm": 527.8275146484375,
"learning_rate": 1.5024969360791564e-07,
"loss": 48.5131,
"step": 832
},
{
"epoch": 0.32581220817952916,
"grad_norm": 0.20000404119491577,
"learning_rate": 1.4594734759333484e-07,
"loss": 33.9883,
"step": 833
},
{
"epoch": 0.3262033392817855,
"grad_norm": 0.5714410543441772,
"learning_rate": 1.4170658539512993e-07,
"loss": 19.7609,
"step": 834
},
{
"epoch": 0.3265944703840418,
"grad_norm": 1.373691439628601,
"learning_rate": 1.375274608162447e-07,
"loss": 33.721,
"step": 835
},
{
"epoch": 0.3269856014862982,
"grad_norm": 0.5083162188529968,
"learning_rate": 1.3341002687762062e-07,
"loss": 21.9135,
"step": 836
},
{
"epoch": 0.32737673258855454,
"grad_norm": 1.2598671913146973,
"learning_rate": 1.2935433581752365e-07,
"loss": 24.6308,
"step": 837
},
{
"epoch": 0.32776786369081085,
"grad_norm": 615.4888305664062,
"learning_rate": 1.253604390908819e-07,
"loss": 33.5934,
"step": 838
},
{
"epoch": 0.3281589947930672,
"grad_norm": 0.8651353716850281,
"learning_rate": 1.2142838736863562e-07,
"loss": 79.5808,
"step": 839
},
{
"epoch": 0.32855012589532356,
"grad_norm": 0.21397706866264343,
"learning_rate": 1.175582305370887e-07,
"loss": 30.6943,
"step": 840
},
{
"epoch": 0.32894125699757987,
"grad_norm": 748.9775390625,
"learning_rate": 1.1375001769728e-07,
"loss": 30.7896,
"step": 841
},
{
"epoch": 0.3293323880998362,
"grad_norm": 0.20578625798225403,
"learning_rate": 1.1000379716435916e-07,
"loss": 28.9501,
"step": 842
},
{
"epoch": 0.32972351920209253,
"grad_norm": 0.34230169653892517,
"learning_rate": 1.0631961646697387e-07,
"loss": 6.3882,
"step": 843
},
{
"epoch": 0.3301146503043489,
"grad_norm": 136.2682342529297,
"learning_rate": 1.0269752234666642e-07,
"loss": 9.9568,
"step": 844
},
{
"epoch": 0.33050578140660525,
"grad_norm": 536.7844848632812,
"learning_rate": 9.913756075728088e-08,
"loss": 42.5157,
"step": 845
},
{
"epoch": 0.33089691250886155,
"grad_norm": 0.6589470505714417,
"learning_rate": 9.563977686438019e-08,
"loss": 15.0413,
"step": 846
},
{
"epoch": 0.3312880436111179,
"grad_norm": 0.44913989305496216,
"learning_rate": 9.22042150446728e-08,
"loss": 22.3091,
"step": 847
},
{
"epoch": 0.3316791747133742,
"grad_norm": 0.11731698364019394,
"learning_rate": 8.883091888545136e-08,
"loss": 26.5784,
"step": 848
},
{
"epoch": 0.3320703058156306,
"grad_norm": 0.27501413226127625,
"learning_rate": 8.551993118403656e-08,
"loss": 18.7753,
"step": 849
},
{
"epoch": 0.33246143691788693,
"grad_norm": 325.61627197265625,
"learning_rate": 8.227129394723643e-08,
"loss": 22.7822,
"step": 850
},
{
"epoch": 0.33285256802014324,
"grad_norm": 0.2187097817659378,
"learning_rate": 7.908504839081343e-08,
"loss": 21.9867,
"step": 851
},
{
"epoch": 0.3332436991223996,
"grad_norm": 476.84649658203125,
"learning_rate": 7.59612349389599e-08,
"loss": 34.4312,
"step": 852
},
{
"epoch": 0.33363483022465595,
"grad_norm": 0.8119810223579407,
"learning_rate": 7.289989322378732e-08,
"loss": 9.815,
"step": 853
},
{
"epoch": 0.33402596132691226,
"grad_norm": 0.5497637987136841,
"learning_rate": 6.990106208482227e-08,
"loss": 25.6327,
"step": 854
},
{
"epoch": 0.3344170924291686,
"grad_norm": 0.14091427624225616,
"learning_rate": 6.696477956851356e-08,
"loss": 17.0462,
"step": 855
},
{
"epoch": 0.3348082235314249,
"grad_norm": 0.36118146777153015,
"learning_rate": 6.409108292774912e-08,
"loss": 29.5031,
"step": 856
},
{
"epoch": 0.3351993546336813,
"grad_norm": 0.4133693277835846,
"learning_rate": 6.12800086213866e-08,
"loss": 23.7123,
"step": 857
},
{
"epoch": 0.33559048573593764,
"grad_norm": 1.1946247816085815,
"learning_rate": 5.853159231378469e-08,
"loss": 46.5555,
"step": 858
},
{
"epoch": 0.33598161683819394,
"grad_norm": 0.21554416418075562,
"learning_rate": 5.584586887435739e-08,
"loss": 17.6013,
"step": 859
},
{
"epoch": 0.3363727479404503,
"grad_norm": 0.16092292964458466,
"learning_rate": 5.322287237712664e-08,
"loss": 10.5376,
"step": 860
},
{
"epoch": 0.3367638790427066,
"grad_norm": 464.0669860839844,
"learning_rate": 5.0662636100292094e-08,
"loss": 36.5366,
"step": 861
},
{
"epoch": 0.33715501014496296,
"grad_norm": 0.7112561464309692,
"learning_rate": 4.8165192525809754e-08,
"loss": 32.8205,
"step": 862
},
{
"epoch": 0.3375461412472193,
"grad_norm": 0.18869797885417938,
"learning_rate": 4.573057333897679e-08,
"loss": 11.1364,
"step": 863
},
{
"epoch": 0.3379372723494756,
"grad_norm": 0.11357381194829941,
"learning_rate": 4.335880942803405e-08,
"loss": 15.2307,
"step": 864
},
{
"epoch": 0.338328403451732,
"grad_norm": 0.2633415162563324,
"learning_rate": 4.104993088376974e-08,
"loss": 11.5664,
"step": 865
},
{
"epoch": 0.3387195345539883,
"grad_norm": 1.018863320350647,
"learning_rate": 3.8803966999139686e-08,
"loss": 8.2019,
"step": 866
},
{
"epoch": 0.33911066565624465,
"grad_norm": 1.0014698505401611,
"learning_rate": 3.662094626889656e-08,
"loss": 14.888,
"step": 867
},
{
"epoch": 0.339501796758501,
"grad_norm": 0.7487742900848389,
"learning_rate": 3.450089638922738e-08,
"loss": 51.2638,
"step": 868
},
{
"epoch": 0.3398929278607573,
"grad_norm": 532.7721557617188,
"learning_rate": 3.2443844257400434e-08,
"loss": 23.042,
"step": 869
},
{
"epoch": 0.34028405896301367,
"grad_norm": 0.3149930536746979,
"learning_rate": 3.044981597142837e-08,
"loss": 4.3578,
"step": 870
},
{
"epoch": 0.34067519006527003,
"grad_norm": 1.0493534803390503,
"learning_rate": 2.8518836829732332e-08,
"loss": 25.666,
"step": 871
},
{
"epoch": 0.34106632116752633,
"grad_norm": 582.3631591796875,
"learning_rate": 2.6650931330823305e-08,
"loss": 31.6531,
"step": 872
},
{
"epoch": 0.3414574522697827,
"grad_norm": 680.9072265625,
"learning_rate": 2.4846123172992953e-08,
"loss": 22.6484,
"step": 873
},
{
"epoch": 0.341848583372039,
"grad_norm": 0.22280123829841614,
"learning_rate": 2.3104435254008852e-08,
"loss": 14.0636,
"step": 874
},
{
"epoch": 0.34223971447429535,
"grad_norm": 0.26472118496894836,
"learning_rate": 2.1425889670827483e-08,
"loss": 19.032,
"step": 875
},
{
"epoch": 0.3426308455765517,
"grad_norm": 238.65916442871094,
"learning_rate": 1.981050771931281e-08,
"loss": 10.7904,
"step": 876
},
{
"epoch": 0.343021976678808,
"grad_norm": 0.1643354743719101,
"learning_rate": 1.8258309893965375e-08,
"loss": 14.425,
"step": 877
},
{
"epoch": 0.3434131077810644,
"grad_norm": 0.3684662878513336,
"learning_rate": 1.6769315887662508e-08,
"loss": 33.7941,
"step": 878
},
{
"epoch": 0.3438042388833207,
"grad_norm": 0.7127506136894226,
"learning_rate": 1.5343544591409632e-08,
"loss": 21.4788,
"step": 879
},
{
"epoch": 0.34419536998557704,
"grad_norm": 0.591823160648346,
"learning_rate": 1.3981014094099354e-08,
"loss": 1.5929,
"step": 880
},
{
"epoch": 0.3445865010878334,
"grad_norm": 0.8882282972335815,
"learning_rate": 1.2681741682282755e-08,
"loss": 16.2899,
"step": 881
},
{
"epoch": 0.3449776321900897,
"grad_norm": 0.4288715720176697,
"learning_rate": 1.1445743839949008e-08,
"loss": 11.78,
"step": 882
},
{
"epoch": 0.34536876329234606,
"grad_norm": 479.3231506347656,
"learning_rate": 1.0273036248318325e-08,
"loss": 27.5783,
"step": 883
},
{
"epoch": 0.3457598943946024,
"grad_norm": 0.7523378133773804,
"learning_rate": 9.163633785639892e-09,
"loss": 31.4348,
"step": 884
},
{
"epoch": 0.3461510254968587,
"grad_norm": 0.4846894145011902,
"learning_rate": 8.117550527005913e-09,
"loss": 2.143,
"step": 885
},
{
"epoch": 0.3465421565991151,
"grad_norm": 229.96755981445312,
"learning_rate": 7.13479974417175e-09,
"loss": 30.5446,
"step": 886
},
{
"epoch": 0.3469332877013714,
"grad_norm": 0.1374731957912445,
"learning_rate": 6.215393905388278e-09,
"loss": 22.3179,
"step": 887
},
{
"epoch": 0.34732441880362774,
"grad_norm": 0.76992267370224,
"learning_rate": 5.359344675242018e-09,
"loss": 14.0893,
"step": 888
},
{
"epoch": 0.3477155499058841,
"grad_norm": 476.39459228515625,
"learning_rate": 4.56666291450858e-09,
"loss": 52.4586,
"step": 889
},
{
"epoch": 0.3481066810081404,
"grad_norm": 71.57829284667969,
"learning_rate": 3.837358680016112e-09,
"loss": 19.4455,
"step": 890
},
{
"epoch": 0.34849781211039677,
"grad_norm": 0.3923113942146301,
"learning_rate": 3.1714412245148486e-09,
"loss": 43.8042,
"step": 891
},
{
"epoch": 0.34888894321265307,
"grad_norm": 0.11082535982131958,
"learning_rate": 2.568918996560532e-09,
"loss": 21.0263,
"step": 892
},
{
"epoch": 0.34928007431490943,
"grad_norm": 0.5106098651885986,
"learning_rate": 2.029799640409502e-09,
"loss": 21.4105,
"step": 893
},
{
"epoch": 0.3496712054171658,
"grad_norm": 0.2212674468755722,
"learning_rate": 1.5540899959187727e-09,
"loss": 29.8404,
"step": 894
},
{
"epoch": 0.3500623365194221,
"grad_norm": 0.4330773949623108,
"learning_rate": 1.1417960984605459e-09,
"loss": 28.9654,
"step": 895
},
{
"epoch": 0.35045346762167845,
"grad_norm": 0.1814635843038559,
"learning_rate": 7.92923178845606e-10,
"loss": 25.6656,
"step": 896
},
{
"epoch": 0.35084459872393475,
"grad_norm": 538.5513916015625,
"learning_rate": 5.07475663257262e-10,
"loss": 32.209,
"step": 897
},
{
"epoch": 0.3512357298261911,
"grad_norm": 536.6629028320312,
"learning_rate": 2.854571731947253e-10,
"loss": 13.3711,
"step": 898
},
{
"epoch": 0.35162686092844747,
"grad_norm": 598.1375732421875,
"learning_rate": 1.2687052542759148e-10,
"loss": 27.3026,
"step": 899
},
{
"epoch": 0.3520179920307038,
"grad_norm": 0.18695542216300964,
"learning_rate": 3.171773195809191e-11,
"loss": 31.2123,
"step": 900
},
{
"epoch": 0.3520179920307038,
"step": 900,
"total_flos": 9.027346976391299e+18,
"train_loss": 213.8077490248945,
"train_runtime": 76099.1347,
"train_samples_per_second": 3.028,
"train_steps_per_second": 0.012
}
],
"logging_steps": 1.0,
"max_steps": 900,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.027346976391299e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}