{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1389,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021598272138228943,
"grad_norm": 2.878943681716919,
"learning_rate": 7.194244604316547e-08,
"loss": 0.7835,
"step": 1
},
{
"epoch": 0.004319654427645789,
"grad_norm": 2.902249813079834,
"learning_rate": 1.4388489208633095e-07,
"loss": 0.7896,
"step": 2
},
{
"epoch": 0.0064794816414686825,
"grad_norm": 2.8640873432159424,
"learning_rate": 2.1582733812949643e-07,
"loss": 0.7588,
"step": 3
},
{
"epoch": 0.008639308855291577,
"grad_norm": 2.825040102005005,
"learning_rate": 2.877697841726619e-07,
"loss": 0.779,
"step": 4
},
{
"epoch": 0.01079913606911447,
"grad_norm": 2.9926884174346924,
"learning_rate": 3.5971223021582736e-07,
"loss": 0.7816,
"step": 5
},
{
"epoch": 0.012958963282937365,
"grad_norm": 2.8692467212677,
"learning_rate": 4.3165467625899287e-07,
"loss": 0.7695,
"step": 6
},
{
"epoch": 0.01511879049676026,
"grad_norm": 2.79731822013855,
"learning_rate": 5.035971223021583e-07,
"loss": 0.7683,
"step": 7
},
{
"epoch": 0.017278617710583154,
"grad_norm": 2.832988739013672,
"learning_rate": 5.755395683453238e-07,
"loss": 0.7865,
"step": 8
},
{
"epoch": 0.019438444924406047,
"grad_norm": 2.787931442260742,
"learning_rate": 6.474820143884893e-07,
"loss": 0.7716,
"step": 9
},
{
"epoch": 0.02159827213822894,
"grad_norm": 2.6542158126831055,
"learning_rate": 7.194244604316547e-07,
"loss": 0.7708,
"step": 10
},
{
"epoch": 0.023758099352051837,
"grad_norm": 2.5756170749664307,
"learning_rate": 7.913669064748202e-07,
"loss": 0.7548,
"step": 11
},
{
"epoch": 0.02591792656587473,
"grad_norm": 2.2221007347106934,
"learning_rate": 8.633093525179857e-07,
"loss": 0.7544,
"step": 12
},
{
"epoch": 0.028077753779697623,
"grad_norm": 2.165950298309326,
"learning_rate": 9.352517985611512e-07,
"loss": 0.7345,
"step": 13
},
{
"epoch": 0.03023758099352052,
"grad_norm": 2.1415212154388428,
"learning_rate": 1.0071942446043167e-06,
"loss": 0.7375,
"step": 14
},
{
"epoch": 0.032397408207343416,
"grad_norm": 2.045217275619507,
"learning_rate": 1.079136690647482e-06,
"loss": 0.7251,
"step": 15
},
{
"epoch": 0.03455723542116631,
"grad_norm": 1.8833245038986206,
"learning_rate": 1.1510791366906476e-06,
"loss": 0.734,
"step": 16
},
{
"epoch": 0.0367170626349892,
"grad_norm": 1.4383106231689453,
"learning_rate": 1.2230215827338131e-06,
"loss": 0.7126,
"step": 17
},
{
"epoch": 0.038876889848812095,
"grad_norm": 1.3764389753341675,
"learning_rate": 1.2949640287769785e-06,
"loss": 0.69,
"step": 18
},
{
"epoch": 0.04103671706263499,
"grad_norm": 1.3699392080307007,
"learning_rate": 1.366906474820144e-06,
"loss": 0.7071,
"step": 19
},
{
"epoch": 0.04319654427645788,
"grad_norm": 1.2943273782730103,
"learning_rate": 1.4388489208633094e-06,
"loss": 0.686,
"step": 20
},
{
"epoch": 0.04535637149028078,
"grad_norm": 1.2634108066558838,
"learning_rate": 1.510791366906475e-06,
"loss": 0.6902,
"step": 21
},
{
"epoch": 0.047516198704103674,
"grad_norm": 1.066751480102539,
"learning_rate": 1.5827338129496403e-06,
"loss": 0.6644,
"step": 22
},
{
"epoch": 0.04967602591792657,
"grad_norm": 1.004930019378662,
"learning_rate": 1.654676258992806e-06,
"loss": 0.6602,
"step": 23
},
{
"epoch": 0.05183585313174946,
"grad_norm": 0.9834485054016113,
"learning_rate": 1.7266187050359715e-06,
"loss": 0.6525,
"step": 24
},
{
"epoch": 0.05399568034557235,
"grad_norm": 0.9758538007736206,
"learning_rate": 1.7985611510791368e-06,
"loss": 0.6452,
"step": 25
},
{
"epoch": 0.056155507559395246,
"grad_norm": 0.9222759008407593,
"learning_rate": 1.8705035971223024e-06,
"loss": 0.6485,
"step": 26
},
{
"epoch": 0.058315334773218146,
"grad_norm": 0.8775356411933899,
"learning_rate": 1.942446043165468e-06,
"loss": 0.6388,
"step": 27
},
{
"epoch": 0.06047516198704104,
"grad_norm": 0.8008519411087036,
"learning_rate": 2.0143884892086333e-06,
"loss": 0.6328,
"step": 28
},
{
"epoch": 0.06263498920086392,
"grad_norm": 0.7609057426452637,
"learning_rate": 2.0863309352517987e-06,
"loss": 0.6253,
"step": 29
},
{
"epoch": 0.06479481641468683,
"grad_norm": 0.6197890043258667,
"learning_rate": 2.158273381294964e-06,
"loss": 0.6253,
"step": 30
},
{
"epoch": 0.06695464362850972,
"grad_norm": 0.6675652265548706,
"learning_rate": 2.23021582733813e-06,
"loss": 0.605,
"step": 31
},
{
"epoch": 0.06911447084233262,
"grad_norm": 0.6976248621940613,
"learning_rate": 2.302158273381295e-06,
"loss": 0.6077,
"step": 32
},
{
"epoch": 0.07127429805615551,
"grad_norm": 0.6653661131858826,
"learning_rate": 2.3741007194244605e-06,
"loss": 0.6021,
"step": 33
},
{
"epoch": 0.0734341252699784,
"grad_norm": 0.6243202090263367,
"learning_rate": 2.4460431654676263e-06,
"loss": 0.6147,
"step": 34
},
{
"epoch": 0.0755939524838013,
"grad_norm": 0.5303459167480469,
"learning_rate": 2.5179856115107916e-06,
"loss": 0.6,
"step": 35
},
{
"epoch": 0.07775377969762419,
"grad_norm": 0.48958107829093933,
"learning_rate": 2.589928057553957e-06,
"loss": 0.5829,
"step": 36
},
{
"epoch": 0.07991360691144708,
"grad_norm": 0.4979974031448364,
"learning_rate": 2.6618705035971228e-06,
"loss": 0.5852,
"step": 37
},
{
"epoch": 0.08207343412526998,
"grad_norm": 0.508642852306366,
"learning_rate": 2.733812949640288e-06,
"loss": 0.5827,
"step": 38
},
{
"epoch": 0.08423326133909287,
"grad_norm": 0.5054506063461304,
"learning_rate": 2.805755395683453e-06,
"loss": 0.5627,
"step": 39
},
{
"epoch": 0.08639308855291576,
"grad_norm": 0.42791351675987244,
"learning_rate": 2.877697841726619e-06,
"loss": 0.557,
"step": 40
},
{
"epoch": 0.08855291576673865,
"grad_norm": 0.3770763874053955,
"learning_rate": 2.949640287769784e-06,
"loss": 0.5452,
"step": 41
},
{
"epoch": 0.09071274298056156,
"grad_norm": 0.38157957792282104,
"learning_rate": 3.02158273381295e-06,
"loss": 0.5552,
"step": 42
},
{
"epoch": 0.09287257019438445,
"grad_norm": 0.4018012583255768,
"learning_rate": 3.0935251798561158e-06,
"loss": 0.5559,
"step": 43
},
{
"epoch": 0.09503239740820735,
"grad_norm": 0.3959904611110687,
"learning_rate": 3.1654676258992807e-06,
"loss": 0.5493,
"step": 44
},
{
"epoch": 0.09719222462203024,
"grad_norm": 0.38622933626174927,
"learning_rate": 3.237410071942446e-06,
"loss": 0.5512,
"step": 45
},
{
"epoch": 0.09935205183585313,
"grad_norm": 0.3973333239555359,
"learning_rate": 3.309352517985612e-06,
"loss": 0.5413,
"step": 46
},
{
"epoch": 0.10151187904967603,
"grad_norm": 0.3897247910499573,
"learning_rate": 3.381294964028777e-06,
"loss": 0.5223,
"step": 47
},
{
"epoch": 0.10367170626349892,
"grad_norm": 0.37678107619285583,
"learning_rate": 3.453237410071943e-06,
"loss": 0.5296,
"step": 48
},
{
"epoch": 0.10583153347732181,
"grad_norm": 0.33324435353279114,
"learning_rate": 3.525179856115108e-06,
"loss": 0.5184,
"step": 49
},
{
"epoch": 0.1079913606911447,
"grad_norm": 0.303320974111557,
"learning_rate": 3.5971223021582737e-06,
"loss": 0.5331,
"step": 50
},
{
"epoch": 0.1101511879049676,
"grad_norm": 0.30076754093170166,
"learning_rate": 3.669064748201439e-06,
"loss": 0.5331,
"step": 51
},
{
"epoch": 0.11231101511879049,
"grad_norm": 0.2589012086391449,
"learning_rate": 3.741007194244605e-06,
"loss": 0.5109,
"step": 52
},
{
"epoch": 0.11447084233261338,
"grad_norm": 0.2596394121646881,
"learning_rate": 3.81294964028777e-06,
"loss": 0.5227,
"step": 53
},
{
"epoch": 0.11663066954643629,
"grad_norm": 0.255307137966156,
"learning_rate": 3.884892086330936e-06,
"loss": 0.5169,
"step": 54
},
{
"epoch": 0.11879049676025918,
"grad_norm": 0.2433944046497345,
"learning_rate": 3.956834532374101e-06,
"loss": 0.5161,
"step": 55
},
{
"epoch": 0.12095032397408208,
"grad_norm": 0.2333260476589203,
"learning_rate": 4.028776978417267e-06,
"loss": 0.5096,
"step": 56
},
{
"epoch": 0.12311015118790497,
"grad_norm": 0.22751125693321228,
"learning_rate": 4.100719424460432e-06,
"loss": 0.5115,
"step": 57
},
{
"epoch": 0.12526997840172785,
"grad_norm": 0.2149927169084549,
"learning_rate": 4.172661870503597e-06,
"loss": 0.5132,
"step": 58
},
{
"epoch": 0.12742980561555076,
"grad_norm": 0.22358939051628113,
"learning_rate": 4.244604316546763e-06,
"loss": 0.5057,
"step": 59
},
{
"epoch": 0.12958963282937366,
"grad_norm": 0.19954045116901398,
"learning_rate": 4.316546762589928e-06,
"loss": 0.4994,
"step": 60
},
{
"epoch": 0.13174946004319654,
"grad_norm": 0.1936485469341278,
"learning_rate": 4.388489208633094e-06,
"loss": 0.4954,
"step": 61
},
{
"epoch": 0.13390928725701945,
"grad_norm": 0.1977352499961853,
"learning_rate": 4.46043165467626e-06,
"loss": 0.4919,
"step": 62
},
{
"epoch": 0.13606911447084233,
"grad_norm": 0.19697633385658264,
"learning_rate": 4.5323741007194245e-06,
"loss": 0.4895,
"step": 63
},
{
"epoch": 0.13822894168466524,
"grad_norm": 0.2068362534046173,
"learning_rate": 4.60431654676259e-06,
"loss": 0.4848,
"step": 64
},
{
"epoch": 0.14038876889848811,
"grad_norm": 0.2056417018175125,
"learning_rate": 4.676258992805755e-06,
"loss": 0.4901,
"step": 65
},
{
"epoch": 0.14254859611231102,
"grad_norm": 0.20445196330547333,
"learning_rate": 4.748201438848921e-06,
"loss": 0.4986,
"step": 66
},
{
"epoch": 0.1447084233261339,
"grad_norm": 0.17678698897361755,
"learning_rate": 4.820143884892087e-06,
"loss": 0.4784,
"step": 67
},
{
"epoch": 0.1468682505399568,
"grad_norm": 0.17606988549232483,
"learning_rate": 4.892086330935253e-06,
"loss": 0.4818,
"step": 68
},
{
"epoch": 0.1490280777537797,
"grad_norm": 0.1764959990978241,
"learning_rate": 4.9640287769784175e-06,
"loss": 0.4832,
"step": 69
},
{
"epoch": 0.1511879049676026,
"grad_norm": 0.18899278342723846,
"learning_rate": 5.035971223021583e-06,
"loss": 0.4832,
"step": 70
},
{
"epoch": 0.15334773218142547,
"grad_norm": 0.18127930164337158,
"learning_rate": 5.107913669064749e-06,
"loss": 0.4781,
"step": 71
},
{
"epoch": 0.15550755939524838,
"grad_norm": 0.15677423775196075,
"learning_rate": 5.179856115107914e-06,
"loss": 0.4795,
"step": 72
},
{
"epoch": 0.15766738660907129,
"grad_norm": 0.17852047085762024,
"learning_rate": 5.251798561151079e-06,
"loss": 0.4802,
"step": 73
},
{
"epoch": 0.15982721382289417,
"grad_norm": 0.16051283478736877,
"learning_rate": 5.3237410071942456e-06,
"loss": 0.4758,
"step": 74
},
{
"epoch": 0.16198704103671707,
"grad_norm": 0.15272092819213867,
"learning_rate": 5.3956834532374105e-06,
"loss": 0.4742,
"step": 75
},
{
"epoch": 0.16414686825053995,
"grad_norm": 0.18069250881671906,
"learning_rate": 5.467625899280576e-06,
"loss": 0.4788,
"step": 76
},
{
"epoch": 0.16630669546436286,
"grad_norm": 0.18495260179042816,
"learning_rate": 5.539568345323741e-06,
"loss": 0.477,
"step": 77
},
{
"epoch": 0.16846652267818574,
"grad_norm": 0.15244323015213013,
"learning_rate": 5.611510791366906e-06,
"loss": 0.4738,
"step": 78
},
{
"epoch": 0.17062634989200864,
"grad_norm": 0.15029869973659515,
"learning_rate": 5.683453237410073e-06,
"loss": 0.4809,
"step": 79
},
{
"epoch": 0.17278617710583152,
"grad_norm": 0.15908615291118622,
"learning_rate": 5.755395683453238e-06,
"loss": 0.4682,
"step": 80
},
{
"epoch": 0.17494600431965443,
"grad_norm": 0.16395969688892365,
"learning_rate": 5.8273381294964035e-06,
"loss": 0.4786,
"step": 81
},
{
"epoch": 0.1771058315334773,
"grad_norm": 0.15997102856636047,
"learning_rate": 5.899280575539568e-06,
"loss": 0.4728,
"step": 82
},
{
"epoch": 0.17926565874730022,
"grad_norm": 0.15442821383476257,
"learning_rate": 5.971223021582734e-06,
"loss": 0.4693,
"step": 83
},
{
"epoch": 0.18142548596112312,
"grad_norm": 0.17457455396652222,
"learning_rate": 6.0431654676259e-06,
"loss": 0.4535,
"step": 84
},
{
"epoch": 0.183585313174946,
"grad_norm": 0.17761239409446716,
"learning_rate": 6.115107913669065e-06,
"loss": 0.4615,
"step": 85
},
{
"epoch": 0.1857451403887689,
"grad_norm": 0.15749000012874603,
"learning_rate": 6.1870503597122315e-06,
"loss": 0.4757,
"step": 86
},
{
"epoch": 0.1879049676025918,
"grad_norm": 0.1500880867242813,
"learning_rate": 6.2589928057553964e-06,
"loss": 0.468,
"step": 87
},
{
"epoch": 0.1900647948164147,
"grad_norm": 0.16475360095500946,
"learning_rate": 6.330935251798561e-06,
"loss": 0.453,
"step": 88
},
{
"epoch": 0.19222462203023757,
"grad_norm": 0.15528172254562378,
"learning_rate": 6.402877697841727e-06,
"loss": 0.4606,
"step": 89
},
{
"epoch": 0.19438444924406048,
"grad_norm": 0.18330231308937073,
"learning_rate": 6.474820143884892e-06,
"loss": 0.4645,
"step": 90
},
{
"epoch": 0.19654427645788336,
"grad_norm": 0.15349973738193512,
"learning_rate": 6.546762589928059e-06,
"loss": 0.4589,
"step": 91
},
{
"epoch": 0.19870410367170627,
"grad_norm": 0.17889103293418884,
"learning_rate": 6.618705035971224e-06,
"loss": 0.4698,
"step": 92
},
{
"epoch": 0.20086393088552915,
"grad_norm": 0.16917382180690765,
"learning_rate": 6.6906474820143886e-06,
"loss": 0.45,
"step": 93
},
{
"epoch": 0.20302375809935205,
"grad_norm": 0.15472815930843353,
"learning_rate": 6.762589928057554e-06,
"loss": 0.4554,
"step": 94
},
{
"epoch": 0.20518358531317496,
"grad_norm": 0.15166456997394562,
"learning_rate": 6.834532374100719e-06,
"loss": 0.4603,
"step": 95
},
{
"epoch": 0.20734341252699784,
"grad_norm": 0.15480853617191315,
"learning_rate": 6.906474820143886e-06,
"loss": 0.4527,
"step": 96
},
{
"epoch": 0.20950323974082075,
"grad_norm": 0.18076568841934204,
"learning_rate": 6.978417266187051e-06,
"loss": 0.4543,
"step": 97
},
{
"epoch": 0.21166306695464362,
"grad_norm": 0.14898645877838135,
"learning_rate": 7.050359712230216e-06,
"loss": 0.4645,
"step": 98
},
{
"epoch": 0.21382289416846653,
"grad_norm": 0.16191677749156952,
"learning_rate": 7.122302158273382e-06,
"loss": 0.4556,
"step": 99
},
{
"epoch": 0.2159827213822894,
"grad_norm": 0.15693144500255585,
"learning_rate": 7.194244604316547e-06,
"loss": 0.4636,
"step": 100
},
{
"epoch": 0.21814254859611232,
"grad_norm": 0.1577419489622116,
"learning_rate": 7.266187050359713e-06,
"loss": 0.445,
"step": 101
},
{
"epoch": 0.2203023758099352,
"grad_norm": 0.1567850261926651,
"learning_rate": 7.338129496402878e-06,
"loss": 0.4579,
"step": 102
},
{
"epoch": 0.2224622030237581,
"grad_norm": 0.15102896094322205,
"learning_rate": 7.410071942446043e-06,
"loss": 0.4381,
"step": 103
},
{
"epoch": 0.22462203023758098,
"grad_norm": 0.18107765913009644,
"learning_rate": 7.48201438848921e-06,
"loss": 0.4479,
"step": 104
},
{
"epoch": 0.2267818574514039,
"grad_norm": 0.15492849051952362,
"learning_rate": 7.5539568345323745e-06,
"loss": 0.4466,
"step": 105
},
{
"epoch": 0.22894168466522677,
"grad_norm": 0.16862063109874725,
"learning_rate": 7.62589928057554e-06,
"loss": 0.4556,
"step": 106
},
{
"epoch": 0.23110151187904968,
"grad_norm": 0.1701633483171463,
"learning_rate": 7.697841726618706e-06,
"loss": 0.4483,
"step": 107
},
{
"epoch": 0.23326133909287258,
"grad_norm": 0.18902191519737244,
"learning_rate": 7.769784172661872e-06,
"loss": 0.4383,
"step": 108
},
{
"epoch": 0.23542116630669546,
"grad_norm": 0.16331182420253754,
"learning_rate": 7.841726618705036e-06,
"loss": 0.4438,
"step": 109
},
{
"epoch": 0.23758099352051837,
"grad_norm": 0.18327923119068146,
"learning_rate": 7.913669064748202e-06,
"loss": 0.4535,
"step": 110
},
{
"epoch": 0.23974082073434125,
"grad_norm": 0.16586214303970337,
"learning_rate": 7.985611510791367e-06,
"loss": 0.452,
"step": 111
},
{
"epoch": 0.24190064794816415,
"grad_norm": 0.1756211370229721,
"learning_rate": 8.057553956834533e-06,
"loss": 0.4461,
"step": 112
},
{
"epoch": 0.24406047516198703,
"grad_norm": 0.17397738993167877,
"learning_rate": 8.129496402877699e-06,
"loss": 0.4444,
"step": 113
},
{
"epoch": 0.24622030237580994,
"grad_norm": 0.1517469584941864,
"learning_rate": 8.201438848920865e-06,
"loss": 0.4423,
"step": 114
},
{
"epoch": 0.24838012958963282,
"grad_norm": 0.15296703577041626,
"learning_rate": 8.273381294964029e-06,
"loss": 0.4434,
"step": 115
},
{
"epoch": 0.2505399568034557,
"grad_norm": 0.17677851021289825,
"learning_rate": 8.345323741007195e-06,
"loss": 0.4352,
"step": 116
},
{
"epoch": 0.2526997840172786,
"grad_norm": 0.1546233892440796,
"learning_rate": 8.41726618705036e-06,
"loss": 0.4416,
"step": 117
},
{
"epoch": 0.2548596112311015,
"grad_norm": 0.17565761506557465,
"learning_rate": 8.489208633093526e-06,
"loss": 0.4484,
"step": 118
},
{
"epoch": 0.2570194384449244,
"grad_norm": 0.1443185657262802,
"learning_rate": 8.561151079136692e-06,
"loss": 0.4291,
"step": 119
},
{
"epoch": 0.2591792656587473,
"grad_norm": 0.17720922827720642,
"learning_rate": 8.633093525179856e-06,
"loss": 0.4356,
"step": 120
},
{
"epoch": 0.2613390928725702,
"grad_norm": 0.17487414181232452,
"learning_rate": 8.705035971223022e-06,
"loss": 0.4465,
"step": 121
},
{
"epoch": 0.2634989200863931,
"grad_norm": 0.16723576188087463,
"learning_rate": 8.776978417266188e-06,
"loss": 0.4463,
"step": 122
},
{
"epoch": 0.265658747300216,
"grad_norm": 0.19939404726028442,
"learning_rate": 8.848920863309353e-06,
"loss": 0.4387,
"step": 123
},
{
"epoch": 0.2678185745140389,
"grad_norm": 0.1569490283727646,
"learning_rate": 8.92086330935252e-06,
"loss": 0.4332,
"step": 124
},
{
"epoch": 0.26997840172786175,
"grad_norm": 0.17922881245613098,
"learning_rate": 8.992805755395683e-06,
"loss": 0.4404,
"step": 125
},
{
"epoch": 0.27213822894168466,
"grad_norm": 0.17273768782615662,
"learning_rate": 9.064748201438849e-06,
"loss": 0.4445,
"step": 126
},
{
"epoch": 0.27429805615550756,
"grad_norm": 0.16782942414283752,
"learning_rate": 9.136690647482015e-06,
"loss": 0.4316,
"step": 127
},
{
"epoch": 0.27645788336933047,
"grad_norm": 0.17636790871620178,
"learning_rate": 9.20863309352518e-06,
"loss": 0.4361,
"step": 128
},
{
"epoch": 0.2786177105831533,
"grad_norm": 0.18042488396167755,
"learning_rate": 9.280575539568346e-06,
"loss": 0.4316,
"step": 129
},
{
"epoch": 0.28077753779697623,
"grad_norm": 0.21798282861709595,
"learning_rate": 9.35251798561151e-06,
"loss": 0.4384,
"step": 130
},
{
"epoch": 0.28293736501079914,
"grad_norm": 0.18524324893951416,
"learning_rate": 9.424460431654678e-06,
"loss": 0.4434,
"step": 131
},
{
"epoch": 0.28509719222462204,
"grad_norm": 0.19849282503128052,
"learning_rate": 9.496402877697842e-06,
"loss": 0.4454,
"step": 132
},
{
"epoch": 0.28725701943844495,
"grad_norm": 0.17093205451965332,
"learning_rate": 9.568345323741008e-06,
"loss": 0.4449,
"step": 133
},
{
"epoch": 0.2894168466522678,
"grad_norm": 0.19003981351852417,
"learning_rate": 9.640287769784174e-06,
"loss": 0.4244,
"step": 134
},
{
"epoch": 0.2915766738660907,
"grad_norm": 0.2193020135164261,
"learning_rate": 9.712230215827338e-06,
"loss": 0.434,
"step": 135
},
{
"epoch": 0.2937365010799136,
"grad_norm": 0.19183115661144257,
"learning_rate": 9.784172661870505e-06,
"loss": 0.4259,
"step": 136
},
{
"epoch": 0.2958963282937365,
"grad_norm": 0.17214708030223846,
"learning_rate": 9.85611510791367e-06,
"loss": 0.4433,
"step": 137
},
{
"epoch": 0.2980561555075594,
"grad_norm": 0.16226549446582794,
"learning_rate": 9.928057553956835e-06,
"loss": 0.4389,
"step": 138
},
{
"epoch": 0.3002159827213823,
"grad_norm": 0.17609405517578125,
"learning_rate": 1e-05,
"loss": 0.4387,
"step": 139
},
{
"epoch": 0.3023758099352052,
"grad_norm": 0.15736715495586395,
"learning_rate": 9.999984208641271e-06,
"loss": 0.4324,
"step": 140
},
{
"epoch": 0.3045356371490281,
"grad_norm": 0.2223547101020813,
"learning_rate": 9.99993683466483e-06,
"loss": 0.4245,
"step": 141
},
{
"epoch": 0.30669546436285094,
"grad_norm": 0.17344172298908234,
"learning_rate": 9.999857878369917e-06,
"loss": 0.4302,
"step": 142
},
{
"epoch": 0.30885529157667385,
"grad_norm": 0.16877353191375732,
"learning_rate": 9.99974734025526e-06,
"loss": 0.4497,
"step": 143
},
{
"epoch": 0.31101511879049676,
"grad_norm": 0.1692124605178833,
"learning_rate": 9.999605221019082e-06,
"loss": 0.4414,
"step": 144
},
{
"epoch": 0.31317494600431967,
"grad_norm": 0.18339934945106506,
"learning_rate": 9.999431521559081e-06,
"loss": 0.4392,
"step": 145
},
{
"epoch": 0.31533477321814257,
"grad_norm": 0.19719652831554413,
"learning_rate": 9.999226242972445e-06,
"loss": 0.4331,
"step": 146
},
{
"epoch": 0.3174946004319654,
"grad_norm": 0.14894719421863556,
"learning_rate": 9.998989386555815e-06,
"loss": 0.4344,
"step": 147
},
{
"epoch": 0.31965442764578833,
"grad_norm": 0.20450158417224884,
"learning_rate": 9.998720953805312e-06,
"loss": 0.4397,
"step": 148
},
{
"epoch": 0.32181425485961124,
"grad_norm": 0.1889685094356537,
"learning_rate": 9.9984209464165e-06,
"loss": 0.4297,
"step": 149
},
{
"epoch": 0.32397408207343414,
"grad_norm": 0.16375744342803955,
"learning_rate": 9.998089366284392e-06,
"loss": 0.4228,
"step": 150
},
{
"epoch": 0.326133909287257,
"grad_norm": 0.15281440317630768,
"learning_rate": 9.997726215503422e-06,
"loss": 0.4264,
"step": 151
},
{
"epoch": 0.3282937365010799,
"grad_norm": 0.16808317601680756,
"learning_rate": 9.997331496367455e-06,
"loss": 0.4247,
"step": 152
},
{
"epoch": 0.3304535637149028,
"grad_norm": 0.168230339884758,
"learning_rate": 9.996905211369748e-06,
"loss": 0.4245,
"step": 153
},
{
"epoch": 0.3326133909287257,
"grad_norm": 0.1692979782819748,
"learning_rate": 9.996447363202947e-06,
"loss": 0.4309,
"step": 154
},
{
"epoch": 0.3347732181425486,
"grad_norm": 0.190389946103096,
"learning_rate": 9.995957954759073e-06,
"loss": 0.4239,
"step": 155
},
{
"epoch": 0.3369330453563715,
"grad_norm": 0.18425118923187256,
"learning_rate": 9.995436989129495e-06,
"loss": 0.4316,
"step": 156
},
{
"epoch": 0.3390928725701944,
"grad_norm": 0.1809394657611847,
"learning_rate": 9.994884469604913e-06,
"loss": 0.4276,
"step": 157
},
{
"epoch": 0.3412526997840173,
"grad_norm": 0.20476803183555603,
"learning_rate": 9.994300399675342e-06,
"loss": 0.4375,
"step": 158
},
{
"epoch": 0.3434125269978402,
"grad_norm": 0.16786271333694458,
"learning_rate": 9.99368478303009e-06,
"loss": 0.4352,
"step": 159
},
{
"epoch": 0.34557235421166305,
"grad_norm": 0.16701987385749817,
"learning_rate": 9.993037623557716e-06,
"loss": 0.4193,
"step": 160
},
{
"epoch": 0.34773218142548595,
"grad_norm": 0.19547881186008453,
"learning_rate": 9.99235892534604e-06,
"loss": 0.4264,
"step": 161
},
{
"epoch": 0.34989200863930886,
"grad_norm": 0.16596215963363647,
"learning_rate": 9.991648692682083e-06,
"loss": 0.4347,
"step": 162
},
{
"epoch": 0.35205183585313177,
"grad_norm": 0.1804916262626648,
"learning_rate": 9.990906930052065e-06,
"loss": 0.4168,
"step": 163
},
{
"epoch": 0.3542116630669546,
"grad_norm": 0.16082727909088135,
"learning_rate": 9.990133642141359e-06,
"loss": 0.4281,
"step": 164
},
{
"epoch": 0.3563714902807775,
"grad_norm": 0.180884450674057,
"learning_rate": 9.989328833834472e-06,
"loss": 0.4318,
"step": 165
},
{
"epoch": 0.35853131749460043,
"grad_norm": 0.16864454746246338,
"learning_rate": 9.988492510215011e-06,
"loss": 0.4306,
"step": 166
},
{
"epoch": 0.36069114470842334,
"grad_norm": 0.17244219779968262,
"learning_rate": 9.987624676565652e-06,
"loss": 0.4282,
"step": 167
},
{
"epoch": 0.36285097192224625,
"grad_norm": 0.1679103672504425,
"learning_rate": 9.986725338368103e-06,
"loss": 0.4195,
"step": 168
},
{
"epoch": 0.3650107991360691,
"grad_norm": 0.16607263684272766,
"learning_rate": 9.98579450130307e-06,
"loss": 0.4288,
"step": 169
},
{
"epoch": 0.367170626349892,
"grad_norm": 0.16679252684116364,
"learning_rate": 9.98483217125023e-06,
"loss": 0.418,
"step": 170
},
{
"epoch": 0.3693304535637149,
"grad_norm": 0.15752755105495453,
"learning_rate": 9.983838354288181e-06,
"loss": 0.438,
"step": 171
},
{
"epoch": 0.3714902807775378,
"grad_norm": 0.1747094839811325,
"learning_rate": 9.982813056694411e-06,
"loss": 0.4316,
"step": 172
},
{
"epoch": 0.37365010799136067,
"grad_norm": 0.1854209452867508,
"learning_rate": 9.981756284945256e-06,
"loss": 0.424,
"step": 173
},
{
"epoch": 0.3758099352051836,
"grad_norm": 0.1754128485918045,
"learning_rate": 9.980668045715864e-06,
"loss": 0.4115,
"step": 174
},
{
"epoch": 0.3779697624190065,
"grad_norm": 0.17791932821273804,
"learning_rate": 9.979548345880142e-06,
"loss": 0.4272,
"step": 175
},
{
"epoch": 0.3801295896328294,
"grad_norm": 0.15502074360847473,
"learning_rate": 9.978397192510722e-06,
"loss": 0.4161,
"step": 176
},
{
"epoch": 0.38228941684665224,
"grad_norm": 0.1928102672100067,
"learning_rate": 9.977214592878917e-06,
"loss": 0.4202,
"step": 177
},
{
"epoch": 0.38444924406047515,
"grad_norm": 0.1752733737230301,
"learning_rate": 9.976000554454668e-06,
"loss": 0.4251,
"step": 178
},
{
"epoch": 0.38660907127429806,
"grad_norm": 0.15899012982845306,
"learning_rate": 9.974755084906503e-06,
"loss": 0.4212,
"step": 179
},
{
"epoch": 0.38876889848812096,
"grad_norm": 0.20840278267860413,
"learning_rate": 9.97347819210148e-06,
"loss": 0.4193,
"step": 180
},
{
"epoch": 0.39092872570194387,
"grad_norm": 0.16049212217330933,
"learning_rate": 9.972169884105155e-06,
"loss": 0.4222,
"step": 181
},
{
"epoch": 0.3930885529157667,
"grad_norm": 0.21681340038776398,
"learning_rate": 9.970830169181504e-06,
"loss": 0.4221,
"step": 182
},
{
"epoch": 0.3952483801295896,
"grad_norm": 0.20257696509361267,
"learning_rate": 9.969459055792903e-06,
"loss": 0.412,
"step": 183
},
{
"epoch": 0.39740820734341253,
"grad_norm": 0.1784621775150299,
"learning_rate": 9.968056552600043e-06,
"loss": 0.4308,
"step": 184
},
{
"epoch": 0.39956803455723544,
"grad_norm": 0.21011000871658325,
"learning_rate": 9.966622668461899e-06,
"loss": 0.4196,
"step": 185
},
{
"epoch": 0.4017278617710583,
"grad_norm": 0.17967236042022705,
"learning_rate": 9.965157412435663e-06,
"loss": 0.4171,
"step": 186
},
{
"epoch": 0.4038876889848812,
"grad_norm": 0.21680930256843567,
"learning_rate": 9.963660793776689e-06,
"loss": 0.4188,
"step": 187
},
{
"epoch": 0.4060475161987041,
"grad_norm": 0.1738550364971161,
"learning_rate": 9.96213282193843e-06,
"loss": 0.4207,
"step": 188
},
{
"epoch": 0.408207343412527,
"grad_norm": 0.1727888137102127,
"learning_rate": 9.960573506572391e-06,
"loss": 0.4244,
"step": 189
},
{
"epoch": 0.4103671706263499,
"grad_norm": 0.19244728982448578,
"learning_rate": 9.958982857528053e-06,
"loss": 0.4162,
"step": 190
},
{
"epoch": 0.41252699784017277,
"grad_norm": 0.1902923285961151,
"learning_rate": 9.957360884852819e-06,
"loss": 0.4272,
"step": 191
},
{
"epoch": 0.4146868250539957,
"grad_norm": 0.15449483692646027,
"learning_rate": 9.955707598791952e-06,
"loss": 0.4103,
"step": 192
},
{
"epoch": 0.4168466522678186,
"grad_norm": 0.16577541828155518,
"learning_rate": 9.954023009788505e-06,
"loss": 0.4262,
"step": 193
},
{
"epoch": 0.4190064794816415,
"grad_norm": 0.17047494649887085,
"learning_rate": 9.952307128483257e-06,
"loss": 0.416,
"step": 194
},
{
"epoch": 0.42116630669546434,
"grad_norm": 0.16605447232723236,
"learning_rate": 9.950559965714647e-06,
"loss": 0.4118,
"step": 195
},
{
"epoch": 0.42332613390928725,
"grad_norm": 0.17296399176120758,
"learning_rate": 9.948781532518706e-06,
"loss": 0.415,
"step": 196
},
{
"epoch": 0.42548596112311016,
"grad_norm": 0.16557732224464417,
"learning_rate": 9.946971840128982e-06,
"loss": 0.399,
"step": 197
},
{
"epoch": 0.42764578833693306,
"grad_norm": 0.1601681262254715,
"learning_rate": 9.945130899976477e-06,
"loss": 0.4091,
"step": 198
},
{
"epoch": 0.4298056155507559,
"grad_norm": 0.17228373885154724,
"learning_rate": 9.94325872368957e-06,
"loss": 0.4169,
"step": 199
},
{
"epoch": 0.4319654427645788,
"grad_norm": 0.1871252954006195,
"learning_rate": 9.941355323093944e-06,
"loss": 0.4064,
"step": 200
},
{
"epoch": 0.43412526997840173,
"grad_norm": 0.16557608544826508,
"learning_rate": 9.939420710212511e-06,
"loss": 0.4022,
"step": 201
},
{
"epoch": 0.43628509719222464,
"grad_norm": 0.19201870262622833,
"learning_rate": 9.937454897265338e-06,
"loss": 0.4106,
"step": 202
},
{
"epoch": 0.43844492440604754,
"grad_norm": 0.20729224383831024,
"learning_rate": 9.935457896669568e-06,
"loss": 0.4231,
"step": 203
},
{
"epoch": 0.4406047516198704,
"grad_norm": 0.1870211958885193,
"learning_rate": 9.93342972103934e-06,
"loss": 0.4048,
"step": 204
},
{
"epoch": 0.4427645788336933,
"grad_norm": 0.17580384016036987,
"learning_rate": 9.931370383185717e-06,
"loss": 0.4088,
"step": 205
},
{
"epoch": 0.4449244060475162,
"grad_norm": 0.20925194025039673,
"learning_rate": 9.929279896116595e-06,
"loss": 0.4148,
"step": 206
},
{
"epoch": 0.4470842332613391,
"grad_norm": 0.2229665368795395,
"learning_rate": 9.927158273036624e-06,
"loss": 0.4185,
"step": 207
},
{
"epoch": 0.44924406047516197,
"grad_norm": 0.1665569692850113,
"learning_rate": 9.925005527347132e-06,
"loss": 0.4137,
"step": 208
},
{
"epoch": 0.4514038876889849,
"grad_norm": 0.18300025165081024,
"learning_rate": 9.922821672646028e-06,
"loss": 0.4098,
"step": 209
},
{
"epoch": 0.4535637149028078,
"grad_norm": 0.21622765064239502,
"learning_rate": 9.920606722727726e-06,
"loss": 0.413,
"step": 210
},
{
"epoch": 0.4557235421166307,
"grad_norm": 0.1885174661874771,
"learning_rate": 9.918360691583056e-06,
"loss": 0.4156,
"step": 211
},
{
"epoch": 0.45788336933045354,
"grad_norm": 0.20590178668498993,
"learning_rate": 9.916083593399167e-06,
"loss": 0.4192,
"step": 212
},
{
"epoch": 0.46004319654427644,
"grad_norm": 0.19168834388256073,
"learning_rate": 9.913775442559451e-06,
"loss": 0.42,
"step": 213
},
{
"epoch": 0.46220302375809935,
"grad_norm": 0.2033228576183319,
"learning_rate": 9.911436253643445e-06,
"loss": 0.4294,
"step": 214
},
{
"epoch": 0.46436285097192226,
"grad_norm": 0.1603459119796753,
"learning_rate": 9.909066041426733e-06,
"loss": 0.4257,
"step": 215
},
{
"epoch": 0.46652267818574517,
"grad_norm": 0.17825163900852203,
"learning_rate": 9.906664820880869e-06,
"loss": 0.4196,
"step": 216
},
{
"epoch": 0.468682505399568,
"grad_norm": 0.19199080765247345,
"learning_rate": 9.904232607173262e-06,
"loss": 0.4213,
"step": 217
},
{
"epoch": 0.4708423326133909,
"grad_norm": 0.16068002581596375,
"learning_rate": 9.9017694156671e-06,
"loss": 0.4178,
"step": 218
},
{
"epoch": 0.47300215982721383,
"grad_norm": 0.18598708510398865,
"learning_rate": 9.899275261921236e-06,
"loss": 0.4119,
"step": 219
},
{
"epoch": 0.47516198704103674,
"grad_norm": 0.17735686898231506,
"learning_rate": 9.8967501616901e-06,
"loss": 0.4159,
"step": 220
},
{
"epoch": 0.4773218142548596,
"grad_norm": 0.20054501295089722,
"learning_rate": 9.894194130923602e-06,
"loss": 0.4228,
"step": 221
},
{
"epoch": 0.4794816414686825,
"grad_norm": 0.1531924605369568,
"learning_rate": 9.891607185767018e-06,
"loss": 0.4182,
"step": 222
},
{
"epoch": 0.4816414686825054,
"grad_norm": 0.20048613846302032,
"learning_rate": 9.8889893425609e-06,
"loss": 0.4184,
"step": 223
},
{
"epoch": 0.4838012958963283,
"grad_norm": 0.16016018390655518,
"learning_rate": 9.886340617840968e-06,
"loss": 0.4162,
"step": 224
},
{
"epoch": 0.48596112311015116,
"grad_norm": 0.17939400672912598,
"learning_rate": 9.883661028338009e-06,
"loss": 0.4216,
"step": 225
},
{
"epoch": 0.48812095032397407,
"grad_norm": 0.17419300973415375,
"learning_rate": 9.880950590977764e-06,
"loss": 0.4165,
"step": 226
},
{
"epoch": 0.490280777537797,
"grad_norm": 0.18040290474891663,
"learning_rate": 9.87820932288083e-06,
"loss": 0.4148,
"step": 227
},
{
"epoch": 0.4924406047516199,
"grad_norm": 0.2009221911430359,
"learning_rate": 9.875437241362546e-06,
"loss": 0.4088,
"step": 228
},
{
"epoch": 0.4946004319654428,
"grad_norm": 0.16184359788894653,
"learning_rate": 9.872634363932887e-06,
"loss": 0.4246,
"step": 229
},
{
"epoch": 0.49676025917926564,
"grad_norm": 0.19945880770683289,
"learning_rate": 9.869800708296347e-06,
"loss": 0.415,
"step": 230
},
{
"epoch": 0.49892008639308855,
"grad_norm": 0.1834121197462082,
"learning_rate": 9.866936292351837e-06,
"loss": 0.413,
"step": 231
},
{
"epoch": 0.5010799136069114,
"grad_norm": 0.19155217707157135,
"learning_rate": 9.864041134192563e-06,
"loss": 0.4145,
"step": 232
},
{
"epoch": 0.5032397408207343,
"grad_norm": 0.16527444124221802,
"learning_rate": 9.861115252105922e-06,
"loss": 0.411,
"step": 233
},
{
"epoch": 0.5053995680345572,
"grad_norm": 0.2018229365348816,
"learning_rate": 9.85815866457337e-06,
"loss": 0.4144,
"step": 234
},
{
"epoch": 0.5075593952483801,
"grad_norm": 0.18045048415660858,
"learning_rate": 9.855171390270325e-06,
"loss": 0.4173,
"step": 235
},
{
"epoch": 0.509719222462203,
"grad_norm": 0.16335050761699677,
"learning_rate": 9.852153448066031e-06,
"loss": 0.4184,
"step": 236
},
{
"epoch": 0.5118790496760259,
"grad_norm": 0.1876971423625946,
"learning_rate": 9.849104857023455e-06,
"loss": 0.4149,
"step": 237
},
{
"epoch": 0.5140388768898488,
"grad_norm": 0.1632338911294937,
"learning_rate": 9.846025636399152e-06,
"loss": 0.4281,
"step": 238
},
{
"epoch": 0.5161987041036717,
"grad_norm": 0.1685461848974228,
"learning_rate": 9.842915805643156e-06,
"loss": 0.4168,
"step": 239
},
{
"epoch": 0.5183585313174947,
"grad_norm": 0.1753380447626114,
"learning_rate": 9.839775384398846e-06,
"loss": 0.4163,
"step": 240
},
{
"epoch": 0.5205183585313174,
"grad_norm": 0.17196574807167053,
"learning_rate": 9.836604392502829e-06,
"loss": 0.4264,
"step": 241
},
{
"epoch": 0.5226781857451404,
"grad_norm": 0.22572582960128784,
"learning_rate": 9.833402849984815e-06,
"loss": 0.4116,
"step": 242
},
{
"epoch": 0.5248380129589633,
"grad_norm": 0.16782043874263763,
"learning_rate": 9.830170777067486e-06,
"loss": 0.416,
"step": 243
},
{
"epoch": 0.5269978401727862,
"grad_norm": 0.1964120864868164,
"learning_rate": 9.82690819416637e-06,
"loss": 0.4189,
"step": 244
},
{
"epoch": 0.5291576673866091,
"grad_norm": 0.16382494568824768,
"learning_rate": 9.823615121889716e-06,
"loss": 0.4216,
"step": 245
},
{
"epoch": 0.531317494600432,
"grad_norm": 0.19145262241363525,
"learning_rate": 9.820291581038354e-06,
"loss": 0.4069,
"step": 246
},
{
"epoch": 0.5334773218142549,
"grad_norm": 0.1793445199728012,
"learning_rate": 9.81693759260558e-06,
"loss": 0.4073,
"step": 247
},
{
"epoch": 0.5356371490280778,
"grad_norm": 0.20790617167949677,
"learning_rate": 9.813553177777005e-06,
"loss": 0.4098,
"step": 248
},
{
"epoch": 0.5377969762419006,
"grad_norm": 0.17739000916481018,
"learning_rate": 9.81013835793043e-06,
"loss": 0.416,
"step": 249
},
{
"epoch": 0.5399568034557235,
"grad_norm": 0.1868736743927002,
"learning_rate": 9.806693154635719e-06,
"loss": 0.4192,
"step": 250
},
{
"epoch": 0.5421166306695464,
"grad_norm": 0.2077651023864746,
"learning_rate": 9.803217589654642e-06,
"loss": 0.4001,
"step": 251
},
{
"epoch": 0.5442764578833693,
"grad_norm": 0.17842766642570496,
"learning_rate": 9.79971168494076e-06,
"loss": 0.4122,
"step": 252
},
{
"epoch": 0.5464362850971922,
"grad_norm": 0.20299148559570312,
"learning_rate": 9.796175462639273e-06,
"loss": 0.4164,
"step": 253
},
{
"epoch": 0.5485961123110151,
"grad_norm": 0.20451399683952332,
"learning_rate": 9.79260894508688e-06,
"loss": 0.4194,
"step": 254
},
{
"epoch": 0.550755939524838,
"grad_norm": 0.16164958477020264,
"learning_rate": 9.789012154811648e-06,
"loss": 0.4037,
"step": 255
},
{
"epoch": 0.5529157667386609,
"grad_norm": 0.19269876182079315,
"learning_rate": 9.785385114532858e-06,
"loss": 0.4086,
"step": 256
},
{
"epoch": 0.5550755939524838,
"grad_norm": 0.22143694758415222,
"learning_rate": 9.781727847160865e-06,
"loss": 0.4205,
"step": 257
},
{
"epoch": 0.5572354211663066,
"grad_norm": 0.20241893827915192,
"learning_rate": 9.77804037579696e-06,
"loss": 0.4135,
"step": 258
},
{
"epoch": 0.5593952483801296,
"grad_norm": 0.19745588302612305,
"learning_rate": 9.774322723733216e-06,
"loss": 0.4129,
"step": 259
},
{
"epoch": 0.5615550755939525,
"grad_norm": 0.1914190798997879,
"learning_rate": 9.770574914452343e-06,
"loss": 0.4153,
"step": 260
},
{
"epoch": 0.5637149028077754,
"grad_norm": 0.18239063024520874,
"learning_rate": 9.766796971627543e-06,
"loss": 0.4183,
"step": 261
},
{
"epoch": 0.5658747300215983,
"grad_norm": 0.18472230434417725,
"learning_rate": 9.762988919122354e-06,
"loss": 0.4129,
"step": 262
},
{
"epoch": 0.5680345572354212,
"grad_norm": 0.18945036828517914,
"learning_rate": 9.759150780990508e-06,
"loss": 0.4145,
"step": 263
},
{
"epoch": 0.5701943844492441,
"grad_norm": 0.18478325009346008,
"learning_rate": 9.755282581475769e-06,
"loss": 0.4057,
"step": 264
},
{
"epoch": 0.572354211663067,
"grad_norm": 0.21743497252464294,
"learning_rate": 9.751384345011787e-06,
"loss": 0.4161,
"step": 265
},
{
"epoch": 0.5745140388768899,
"grad_norm": 0.17114116251468658,
"learning_rate": 9.747456096221946e-06,
"loss": 0.4007,
"step": 266
},
{
"epoch": 0.5766738660907127,
"grad_norm": 0.187269389629364,
"learning_rate": 9.743497859919196e-06,
"loss": 0.4048,
"step": 267
},
{
"epoch": 0.5788336933045356,
"grad_norm": 0.16859321296215057,
"learning_rate": 9.739509661105912e-06,
"loss": 0.4109,
"step": 268
},
{
"epoch": 0.5809935205183585,
"grad_norm": 0.1999719887971878,
"learning_rate": 9.735491524973723e-06,
"loss": 0.3952,
"step": 269
},
{
"epoch": 0.5831533477321814,
"grad_norm": 0.176213800907135,
"learning_rate": 9.73144347690336e-06,
"loss": 0.4177,
"step": 270
},
{
"epoch": 0.5853131749460043,
"grad_norm": 0.1951010376214981,
"learning_rate": 9.727365542464498e-06,
"loss": 0.4164,
"step": 271
},
{
"epoch": 0.5874730021598272,
"grad_norm": 0.17570029199123383,
"learning_rate": 9.723257747415584e-06,
"loss": 0.4094,
"step": 272
},
{
"epoch": 0.5896328293736501,
"grad_norm": 0.179957315325737,
"learning_rate": 9.719120117703688e-06,
"loss": 0.406,
"step": 273
},
{
"epoch": 0.591792656587473,
"grad_norm": 0.17086850106716156,
"learning_rate": 9.714952679464324e-06,
"loss": 0.403,
"step": 274
},
{
"epoch": 0.593952483801296,
"grad_norm": 0.17228035628795624,
"learning_rate": 9.710755459021297e-06,
"loss": 0.4109,
"step": 275
},
{
"epoch": 0.5961123110151187,
"grad_norm": 0.19265015423297882,
"learning_rate": 9.706528482886535e-06,
"loss": 0.4209,
"step": 276
},
{
"epoch": 0.5982721382289417,
"grad_norm": 0.16357901692390442,
"learning_rate": 9.702271777759915e-06,
"loss": 0.4061,
"step": 277
},
{
"epoch": 0.6004319654427646,
"grad_norm": 0.17083071172237396,
"learning_rate": 9.697985370529101e-06,
"loss": 0.3996,
"step": 278
},
{
"epoch": 0.6025917926565875,
"grad_norm": 0.1811821013689041,
"learning_rate": 9.693669288269371e-06,
"loss": 0.4182,
"step": 279
},
{
"epoch": 0.6047516198704104,
"grad_norm": 0.1488206833600998,
"learning_rate": 9.689323558243446e-06,
"loss": 0.3981,
"step": 280
},
{
"epoch": 0.6069114470842333,
"grad_norm": 0.185231015086174,
"learning_rate": 9.684948207901315e-06,
"loss": 0.4132,
"step": 281
},
{
"epoch": 0.6090712742980562,
"grad_norm": 0.14964009821414948,
"learning_rate": 9.680543264880075e-06,
"loss": 0.4098,
"step": 282
},
{
"epoch": 0.6112311015118791,
"grad_norm": 0.16769437491893768,
"learning_rate": 9.676108757003735e-06,
"loss": 0.418,
"step": 283
},
{
"epoch": 0.6133909287257019,
"grad_norm": 0.1763710230588913,
"learning_rate": 9.671644712283061e-06,
"loss": 0.4111,
"step": 284
},
{
"epoch": 0.6155507559395248,
"grad_norm": 0.17033055424690247,
"learning_rate": 9.667151158915382e-06,
"loss": 0.4138,
"step": 285
},
{
"epoch": 0.6177105831533477,
"grad_norm": 0.21479171514511108,
"learning_rate": 9.662628125284426e-06,
"loss": 0.4164,
"step": 286
},
{
"epoch": 0.6198704103671706,
"grad_norm": 0.18288952112197876,
"learning_rate": 9.65807563996013e-06,
"loss": 0.416,
"step": 287
},
{
"epoch": 0.6220302375809935,
"grad_norm": 0.20399482548236847,
"learning_rate": 9.653493731698467e-06,
"loss": 0.4145,
"step": 288
},
{
"epoch": 0.6241900647948164,
"grad_norm": 0.19287261366844177,
"learning_rate": 9.648882429441258e-06,
"loss": 0.4131,
"step": 289
},
{
"epoch": 0.6263498920086393,
"grad_norm": 0.17563579976558685,
"learning_rate": 9.644241762315995e-06,
"loss": 0.4097,
"step": 290
},
{
"epoch": 0.6285097192224622,
"grad_norm": 0.18624839186668396,
"learning_rate": 9.639571759635655e-06,
"loss": 0.4176,
"step": 291
},
{
"epoch": 0.6306695464362851,
"grad_norm": 0.18379148840904236,
"learning_rate": 9.634872450898511e-06,
"loss": 0.4035,
"step": 292
},
{
"epoch": 0.6328293736501079,
"grad_norm": 0.1886526644229889,
"learning_rate": 9.630143865787951e-06,
"loss": 0.4068,
"step": 293
},
{
"epoch": 0.6349892008639308,
"grad_norm": 0.16463734209537506,
"learning_rate": 9.62538603417229e-06,
"loss": 0.4163,
"step": 294
},
{
"epoch": 0.6371490280777538,
"grad_norm": 0.1974654197692871,
"learning_rate": 9.620598986104578e-06,
"loss": 0.4039,
"step": 295
},
{
"epoch": 0.6393088552915767,
"grad_norm": 0.1882481724023819,
"learning_rate": 9.615782751822413e-06,
"loss": 0.4115,
"step": 296
},
{
"epoch": 0.6414686825053996,
"grad_norm": 0.15222138166427612,
"learning_rate": 9.610937361747747e-06,
"loss": 0.4045,
"step": 297
},
{
"epoch": 0.6436285097192225,
"grad_norm": 0.17053523659706116,
"learning_rate": 9.606062846486698e-06,
"loss": 0.4119,
"step": 298
},
{
"epoch": 0.6457883369330454,
"grad_norm": 0.15987005829811096,
"learning_rate": 9.601159236829353e-06,
"loss": 0.3964,
"step": 299
},
{
"epoch": 0.6479481641468683,
"grad_norm": 0.16534611582756042,
"learning_rate": 9.596226563749575e-06,
"loss": 0.4115,
"step": 300
},
{
"epoch": 0.6501079913606912,
"grad_norm": 0.1743890643119812,
"learning_rate": 9.591264858404809e-06,
"loss": 0.4241,
"step": 301
},
{
"epoch": 0.652267818574514,
"grad_norm": 0.14473925530910492,
"learning_rate": 9.586274152135883e-06,
"loss": 0.4011,
"step": 302
},
{
"epoch": 0.6544276457883369,
"grad_norm": 0.1717105656862259,
"learning_rate": 9.58125447646681e-06,
"loss": 0.4128,
"step": 303
},
{
"epoch": 0.6565874730021598,
"grad_norm": 0.16893403232097626,
"learning_rate": 9.576205863104588e-06,
"loss": 0.3984,
"step": 304
},
{
"epoch": 0.6587473002159827,
"grad_norm": 0.19387434422969818,
"learning_rate": 9.571128343939006e-06,
"loss": 0.4086,
"step": 305
},
{
"epoch": 0.6609071274298056,
"grad_norm": 0.1532067507505417,
"learning_rate": 9.566021951042432e-06,
"loss": 0.413,
"step": 306
},
{
"epoch": 0.6630669546436285,
"grad_norm": 0.19082939624786377,
"learning_rate": 9.56088671666962e-06,
"loss": 0.4028,
"step": 307
},
{
"epoch": 0.6652267818574514,
"grad_norm": 0.1660735309123993,
"learning_rate": 9.555722673257502e-06,
"loss": 0.4048,
"step": 308
},
{
"epoch": 0.6673866090712743,
"grad_norm": 0.1646290272474289,
"learning_rate": 9.550529853424979e-06,
"loss": 0.401,
"step": 309
},
{
"epoch": 0.6695464362850972,
"grad_norm": 0.1946524977684021,
"learning_rate": 9.545308289972727e-06,
"loss": 0.3999,
"step": 310
},
{
"epoch": 0.67170626349892,
"grad_norm": 0.1731284260749817,
"learning_rate": 9.54005801588298e-06,
"loss": 0.4056,
"step": 311
},
{
"epoch": 0.673866090712743,
"grad_norm": 0.1577766239643097,
"learning_rate": 9.534779064319318e-06,
"loss": 0.3952,
"step": 312
},
{
"epoch": 0.6760259179265659,
"grad_norm": 0.20560896396636963,
"learning_rate": 9.529471468626472e-06,
"loss": 0.4082,
"step": 313
},
{
"epoch": 0.6781857451403888,
"grad_norm": 0.16146545112133026,
"learning_rate": 9.524135262330098e-06,
"loss": 0.4044,
"step": 314
},
{
"epoch": 0.6803455723542117,
"grad_norm": 0.18924373388290405,
"learning_rate": 9.51877047913658e-06,
"loss": 0.3949,
"step": 315
},
{
"epoch": 0.6825053995680346,
"grad_norm": 0.20120824873447418,
"learning_rate": 9.513377152932796e-06,
"loss": 0.4098,
"step": 316
},
{
"epoch": 0.6846652267818575,
"grad_norm": 0.17236529290676117,
"learning_rate": 9.507955317785935e-06,
"loss": 0.4005,
"step": 317
},
{
"epoch": 0.6868250539956804,
"grad_norm": 0.19479617476463318,
"learning_rate": 9.502505007943248e-06,
"loss": 0.4115,
"step": 318
},
{
"epoch": 0.6889848812095032,
"grad_norm": 0.18137769401073456,
"learning_rate": 9.497026257831856e-06,
"loss": 0.4006,
"step": 319
},
{
"epoch": 0.6911447084233261,
"grad_norm": 0.18386590480804443,
"learning_rate": 9.491519102058523e-06,
"loss": 0.4045,
"step": 320
},
{
"epoch": 0.693304535637149,
"grad_norm": 0.18597717583179474,
"learning_rate": 9.48598357540944e-06,
"loss": 0.3974,
"step": 321
},
{
"epoch": 0.6954643628509719,
"grad_norm": 0.19069334864616394,
"learning_rate": 9.480419712849996e-06,
"loss": 0.4139,
"step": 322
},
{
"epoch": 0.6976241900647948,
"grad_norm": 0.18793267011642456,
"learning_rate": 9.474827549524574e-06,
"loss": 0.4105,
"step": 323
},
{
"epoch": 0.6997840172786177,
"grad_norm": 0.19101367890834808,
"learning_rate": 9.46920712075632e-06,
"loss": 0.3988,
"step": 324
},
{
"epoch": 0.7019438444924406,
"grad_norm": 0.15915150940418243,
"learning_rate": 9.463558462046912e-06,
"loss": 0.4052,
"step": 325
},
{
"epoch": 0.7041036717062635,
"grad_norm": 0.20149967074394226,
"learning_rate": 9.457881609076352e-06,
"loss": 0.4039,
"step": 326
},
{
"epoch": 0.7062634989200864,
"grad_norm": 0.1692509800195694,
"learning_rate": 9.452176597702724e-06,
"loss": 0.4146,
"step": 327
},
{
"epoch": 0.7084233261339092,
"grad_norm": 0.16798816621303558,
"learning_rate": 9.446443463961986e-06,
"loss": 0.3943,
"step": 328
},
{
"epoch": 0.7105831533477321,
"grad_norm": 0.16925224661827087,
"learning_rate": 9.440682244067724e-06,
"loss": 0.3992,
"step": 329
},
{
"epoch": 0.712742980561555,
"grad_norm": 0.19609062373638153,
"learning_rate": 9.434892974410932e-06,
"loss": 0.4094,
"step": 330
},
{
"epoch": 0.714902807775378,
"grad_norm": 0.19346529245376587,
"learning_rate": 9.429075691559788e-06,
"loss": 0.4018,
"step": 331
},
{
"epoch": 0.7170626349892009,
"grad_norm": 0.18897579610347748,
"learning_rate": 9.423230432259409e-06,
"loss": 0.4012,
"step": 332
},
{
"epoch": 0.7192224622030238,
"grad_norm": 0.16114945709705353,
"learning_rate": 9.41735723343163e-06,
"loss": 0.3988,
"step": 333
},
{
"epoch": 0.7213822894168467,
"grad_norm": 0.1889643669128418,
"learning_rate": 9.411456132174768e-06,
"loss": 0.3912,
"step": 334
},
{
"epoch": 0.7235421166306696,
"grad_norm": 0.20438078045845032,
"learning_rate": 9.405527165763384e-06,
"loss": 0.4036,
"step": 335
},
{
"epoch": 0.7257019438444925,
"grad_norm": 0.1676449030637741,
"learning_rate": 9.399570371648052e-06,
"loss": 0.4085,
"step": 336
},
{
"epoch": 0.7278617710583153,
"grad_norm": 0.23122479021549225,
"learning_rate": 9.393585787455125e-06,
"loss": 0.4075,
"step": 337
},
{
"epoch": 0.7300215982721382,
"grad_norm": 0.15428000688552856,
"learning_rate": 9.387573450986485e-06,
"loss": 0.3979,
"step": 338
},
{
"epoch": 0.7321814254859611,
"grad_norm": 0.1889045536518097,
"learning_rate": 9.381533400219319e-06,
"loss": 0.4004,
"step": 339
},
{
"epoch": 0.734341252699784,
"grad_norm": 0.16855069994926453,
"learning_rate": 9.37546567330587e-06,
"loss": 0.4021,
"step": 340
},
{
"epoch": 0.7365010799136069,
"grad_norm": 0.15914303064346313,
"learning_rate": 9.369370308573198e-06,
"loss": 0.4147,
"step": 341
},
{
"epoch": 0.7386609071274298,
"grad_norm": 0.18533971905708313,
"learning_rate": 9.363247344522939e-06,
"loss": 0.4025,
"step": 342
},
{
"epoch": 0.7408207343412527,
"grad_norm": 0.15280672907829285,
"learning_rate": 9.357096819831065e-06,
"loss": 0.4061,
"step": 343
},
{
"epoch": 0.7429805615550756,
"grad_norm": 0.1812913715839386,
"learning_rate": 9.35091877334763e-06,
"loss": 0.4008,
"step": 344
},
{
"epoch": 0.7451403887688985,
"grad_norm": 0.19496847689151764,
"learning_rate": 9.344713244096533e-06,
"loss": 0.4063,
"step": 345
},
{
"epoch": 0.7473002159827213,
"grad_norm": 0.15390554070472717,
"learning_rate": 9.33848027127527e-06,
"loss": 0.3943,
"step": 346
},
{
"epoch": 0.7494600431965442,
"grad_norm": 0.18108762800693512,
"learning_rate": 9.332219894254686e-06,
"loss": 0.4037,
"step": 347
},
{
"epoch": 0.7516198704103672,
"grad_norm": 0.172384575009346,
"learning_rate": 9.325932152578726e-06,
"loss": 0.404,
"step": 348
},
{
"epoch": 0.7537796976241901,
"grad_norm": 0.1718224287033081,
"learning_rate": 9.319617085964177e-06,
"loss": 0.4098,
"step": 349
},
{
"epoch": 0.755939524838013,
"grad_norm": 0.16733084619045258,
"learning_rate": 9.31327473430044e-06,
"loss": 0.41,
"step": 350
},
{
"epoch": 0.7580993520518359,
"grad_norm": 0.15835174918174744,
"learning_rate": 9.30690513764925e-06,
"loss": 0.4108,
"step": 351
},
{
"epoch": 0.7602591792656588,
"grad_norm": 0.16416366398334503,
"learning_rate": 9.300508336244443e-06,
"loss": 0.4123,
"step": 352
},
{
"epoch": 0.7624190064794817,
"grad_norm": 0.15685053169727325,
"learning_rate": 9.294084370491695e-06,
"loss": 0.4026,
"step": 353
},
{
"epoch": 0.7645788336933045,
"grad_norm": 0.17324267327785492,
"learning_rate": 9.287633280968263e-06,
"loss": 0.4043,
"step": 354
},
{
"epoch": 0.7667386609071274,
"grad_norm": 0.16480839252471924,
"learning_rate": 9.281155108422732e-06,
"loss": 0.3903,
"step": 355
},
{
"epoch": 0.7688984881209503,
"grad_norm": 0.155819833278656,
"learning_rate": 9.274649893774768e-06,
"loss": 0.4163,
"step": 356
},
{
"epoch": 0.7710583153347732,
"grad_norm": 0.1437472552061081,
"learning_rate": 9.268117678114833e-06,
"loss": 0.3983,
"step": 357
},
{
"epoch": 0.7732181425485961,
"grad_norm": 0.1644992083311081,
"learning_rate": 9.26155850270396e-06,
"loss": 0.4143,
"step": 358
},
{
"epoch": 0.775377969762419,
"grad_norm": 0.15442179143428802,
"learning_rate": 9.25497240897346e-06,
"loss": 0.4186,
"step": 359
},
{
"epoch": 0.7775377969762419,
"grad_norm": 0.16961856186389923,
"learning_rate": 9.248359438524683e-06,
"loss": 0.4056,
"step": 360
},
{
"epoch": 0.7796976241900648,
"grad_norm": 0.14529763162136078,
"learning_rate": 9.241719633128743e-06,
"loss": 0.4081,
"step": 361
},
{
"epoch": 0.7818574514038877,
"grad_norm": 0.17451095581054688,
"learning_rate": 9.235053034726261e-06,
"loss": 0.4011,
"step": 362
},
{
"epoch": 0.7840172786177105,
"grad_norm": 0.16993848979473114,
"learning_rate": 9.228359685427095e-06,
"loss": 0.4126,
"step": 363
},
{
"epoch": 0.7861771058315334,
"grad_norm": 0.1698153018951416,
"learning_rate": 9.221639627510076e-06,
"loss": 0.3983,
"step": 364
},
{
"epoch": 0.7883369330453563,
"grad_norm": 0.15617668628692627,
"learning_rate": 9.214892903422745e-06,
"loss": 0.3894,
"step": 365
},
{
"epoch": 0.7904967602591793,
"grad_norm": 0.1748441755771637,
"learning_rate": 9.208119555781074e-06,
"loss": 0.4042,
"step": 366
},
{
"epoch": 0.7926565874730022,
"grad_norm": 0.18701235949993134,
"learning_rate": 9.201319627369211e-06,
"loss": 0.4166,
"step": 367
},
{
"epoch": 0.7948164146868251,
"grad_norm": 0.15359680354595184,
"learning_rate": 9.1944931611392e-06,
"loss": 0.4025,
"step": 368
},
{
"epoch": 0.796976241900648,
"grad_norm": 0.17842437326908112,
"learning_rate": 9.18764020021071e-06,
"loss": 0.4157,
"step": 369
},
{
"epoch": 0.7991360691144709,
"grad_norm": 0.16838903725147247,
"learning_rate": 9.180760787870766e-06,
"loss": 0.4058,
"step": 370
},
{
"epoch": 0.8012958963282938,
"grad_norm": 0.17230413854122162,
"learning_rate": 9.173854967573479e-06,
"loss": 0.4063,
"step": 371
},
{
"epoch": 0.8034557235421166,
"grad_norm": 0.17813710868358612,
"learning_rate": 9.166922782939759e-06,
"loss": 0.4122,
"step": 372
},
{
"epoch": 0.8056155507559395,
"grad_norm": 0.19047455489635468,
"learning_rate": 9.159964277757054e-06,
"loss": 0.4026,
"step": 373
},
{
"epoch": 0.8077753779697624,
"grad_norm": 0.15476709604263306,
"learning_rate": 9.152979495979064e-06,
"loss": 0.3872,
"step": 374
},
{
"epoch": 0.8099352051835853,
"grad_norm": 0.15130369365215302,
"learning_rate": 9.145968481725466e-06,
"loss": 0.4018,
"step": 375
},
{
"epoch": 0.8120950323974082,
"grad_norm": 0.1687459796667099,
"learning_rate": 9.13893127928164e-06,
"loss": 0.3983,
"step": 376
},
{
"epoch": 0.8142548596112311,
"grad_norm": 0.1546049863100052,
"learning_rate": 9.131867933098379e-06,
"loss": 0.4109,
"step": 377
},
{
"epoch": 0.816414686825054,
"grad_norm": 0.1616266667842865,
"learning_rate": 9.124778487791615e-06,
"loss": 0.4039,
"step": 378
},
{
"epoch": 0.8185745140388769,
"grad_norm": 0.1830417811870575,
"learning_rate": 9.117662988142138e-06,
"loss": 0.4053,
"step": 379
},
{
"epoch": 0.8207343412526998,
"grad_norm": 0.15087199211120605,
"learning_rate": 9.110521479095314e-06,
"loss": 0.4111,
"step": 380
},
{
"epoch": 0.8228941684665226,
"grad_norm": 0.15791049599647522,
"learning_rate": 9.10335400576079e-06,
"loss": 0.3882,
"step": 381
},
{
"epoch": 0.8250539956803455,
"grad_norm": 0.16011568903923035,
"learning_rate": 9.096160613412228e-06,
"loss": 0.4101,
"step": 382
},
{
"epoch": 0.8272138228941684,
"grad_norm": 0.1656263768672943,
"learning_rate": 9.088941347487004e-06,
"loss": 0.394,
"step": 383
},
{
"epoch": 0.8293736501079914,
"grad_norm": 0.15749986469745636,
"learning_rate": 9.08169625358592e-06,
"loss": 0.3972,
"step": 384
},
{
"epoch": 0.8315334773218143,
"grad_norm": 0.15940222144126892,
"learning_rate": 9.074425377472932e-06,
"loss": 0.4003,
"step": 385
},
{
"epoch": 0.8336933045356372,
"grad_norm": 0.17559286952018738,
"learning_rate": 9.067128765074842e-06,
"loss": 0.4046,
"step": 386
},
{
"epoch": 0.8358531317494601,
"grad_norm": 0.1646784096956253,
"learning_rate": 9.059806462481022e-06,
"loss": 0.3968,
"step": 387
},
{
"epoch": 0.838012958963283,
"grad_norm": 0.16697706282138824,
"learning_rate": 9.052458515943112e-06,
"loss": 0.4146,
"step": 388
},
{
"epoch": 0.8401727861771058,
"grad_norm": 0.17301729321479797,
"learning_rate": 9.045084971874738e-06,
"loss": 0.4037,
"step": 389
},
{
"epoch": 0.8423326133909287,
"grad_norm": 0.1766882836818695,
"learning_rate": 9.037685876851211e-06,
"loss": 0.4019,
"step": 390
},
{
"epoch": 0.8444924406047516,
"grad_norm": 0.16974475979804993,
"learning_rate": 9.030261277609235e-06,
"loss": 0.3978,
"step": 391
},
{
"epoch": 0.8466522678185745,
"grad_norm": 0.17788070440292358,
"learning_rate": 9.022811221046618e-06,
"loss": 0.4062,
"step": 392
},
{
"epoch": 0.8488120950323974,
"grad_norm": 0.16667339205741882,
"learning_rate": 9.015335754221964e-06,
"loss": 0.4167,
"step": 393
},
{
"epoch": 0.8509719222462203,
"grad_norm": 0.15693309903144836,
"learning_rate": 9.007834924354384e-06,
"loss": 0.3988,
"step": 394
},
{
"epoch": 0.8531317494600432,
"grad_norm": 0.16362878680229187,
"learning_rate": 9.000308778823196e-06,
"loss": 0.3995,
"step": 395
},
{
"epoch": 0.8552915766738661,
"grad_norm": 0.14635585248470306,
"learning_rate": 8.992757365167625e-06,
"loss": 0.4028,
"step": 396
},
{
"epoch": 0.857451403887689,
"grad_norm": 0.16527874767780304,
"learning_rate": 8.985180731086505e-06,
"loss": 0.406,
"step": 397
},
{
"epoch": 0.8596112311015118,
"grad_norm": 0.2163344919681549,
"learning_rate": 8.977578924437976e-06,
"loss": 0.3985,
"step": 398
},
{
"epoch": 0.8617710583153347,
"grad_norm": 0.14798112213611603,
"learning_rate": 8.969951993239177e-06,
"loss": 0.4011,
"step": 399
},
{
"epoch": 0.8639308855291576,
"grad_norm": 0.16196613013744354,
"learning_rate": 8.962299985665955e-06,
"loss": 0.4057,
"step": 400
},
{
"epoch": 0.8660907127429806,
"grad_norm": 0.15940962731838226,
"learning_rate": 8.954622950052543e-06,
"loss": 0.4027,
"step": 401
},
{
"epoch": 0.8682505399568035,
"grad_norm": 0.16603127121925354,
"learning_rate": 8.946920934891274e-06,
"loss": 0.4106,
"step": 402
},
{
"epoch": 0.8704103671706264,
"grad_norm": 0.16625916957855225,
"learning_rate": 8.939193988832261e-06,
"loss": 0.3997,
"step": 403
},
{
"epoch": 0.8725701943844493,
"grad_norm": 0.17211325466632843,
"learning_rate": 8.931442160683094e-06,
"loss": 0.4036,
"step": 404
},
{
"epoch": 0.8747300215982722,
"grad_norm": 0.17657049000263214,
"learning_rate": 8.923665499408535e-06,
"loss": 0.393,
"step": 405
},
{
"epoch": 0.8768898488120951,
"grad_norm": 0.18346846103668213,
"learning_rate": 8.915864054130203e-06,
"loss": 0.3911,
"step": 406
},
{
"epoch": 0.8790496760259179,
"grad_norm": 0.17051193118095398,
"learning_rate": 8.908037874126263e-06,
"loss": 0.3916,
"step": 407
},
{
"epoch": 0.8812095032397408,
"grad_norm": 0.15643054246902466,
"learning_rate": 8.900187008831124e-06,
"loss": 0.3957,
"step": 408
},
{
"epoch": 0.8833693304535637,
"grad_norm": 0.18112455308437347,
"learning_rate": 8.892311507835118e-06,
"loss": 0.4006,
"step": 409
},
{
"epoch": 0.8855291576673866,
"grad_norm": 0.1472531408071518,
"learning_rate": 8.88441142088419e-06,
"loss": 0.3969,
"step": 410
},
{
"epoch": 0.8876889848812095,
"grad_norm": 0.16634514927864075,
"learning_rate": 8.87648679787958e-06,
"loss": 0.4052,
"step": 411
},
{
"epoch": 0.8898488120950324,
"grad_norm": 0.16606342792510986,
"learning_rate": 8.868537688877516e-06,
"loss": 0.3999,
"step": 412
},
{
"epoch": 0.8920086393088553,
"grad_norm": 0.16223309934139252,
"learning_rate": 8.860564144088891e-06,
"loss": 0.4053,
"step": 413
},
{
"epoch": 0.8941684665226782,
"grad_norm": 0.17775796353816986,
"learning_rate": 8.852566213878947e-06,
"loss": 0.3996,
"step": 414
},
{
"epoch": 0.896328293736501,
"grad_norm": 0.16113241016864777,
"learning_rate": 8.844543948766958e-06,
"loss": 0.3874,
"step": 415
},
{
"epoch": 0.8984881209503239,
"grad_norm": 0.19586795568466187,
"learning_rate": 8.83649739942591e-06,
"loss": 0.4012,
"step": 416
},
{
"epoch": 0.9006479481641468,
"grad_norm": 0.18052950501441956,
"learning_rate": 8.828426616682184e-06,
"loss": 0.3973,
"step": 417
},
{
"epoch": 0.9028077753779697,
"grad_norm": 0.16518956422805786,
"learning_rate": 8.820331651515226e-06,
"loss": 0.3997,
"step": 418
},
{
"epoch": 0.9049676025917927,
"grad_norm": 0.1827470362186432,
"learning_rate": 8.81221255505724e-06,
"loss": 0.4008,
"step": 419
},
{
"epoch": 0.9071274298056156,
"grad_norm": 0.18678082525730133,
"learning_rate": 8.80406937859285e-06,
"loss": 0.3953,
"step": 420
},
{
"epoch": 0.9092872570194385,
"grad_norm": 0.1759604662656784,
"learning_rate": 8.795902173558784e-06,
"loss": 0.4037,
"step": 421
},
{
"epoch": 0.9114470842332614,
"grad_norm": 0.1986621916294098,
"learning_rate": 8.787710991543547e-06,
"loss": 0.4125,
"step": 422
},
{
"epoch": 0.9136069114470843,
"grad_norm": 0.19601307809352875,
"learning_rate": 8.779495884287099e-06,
"loss": 0.4018,
"step": 423
},
{
"epoch": 0.9157667386609071,
"grad_norm": 0.16699747741222382,
"learning_rate": 8.77125690368052e-06,
"loss": 0.4029,
"step": 424
},
{
"epoch": 0.91792656587473,
"grad_norm": 0.16781239211559296,
"learning_rate": 8.76299410176569e-06,
"loss": 0.3956,
"step": 425
},
{
"epoch": 0.9200863930885529,
"grad_norm": 0.17204856872558594,
"learning_rate": 8.754707530734958e-06,
"loss": 0.4033,
"step": 426
},
{
"epoch": 0.9222462203023758,
"grad_norm": 0.1568082720041275,
"learning_rate": 8.74639724293081e-06,
"loss": 0.3937,
"step": 427
},
{
"epoch": 0.9244060475161987,
"grad_norm": 0.18325375020503998,
"learning_rate": 8.738063290845536e-06,
"loss": 0.4077,
"step": 428
},
{
"epoch": 0.9265658747300216,
"grad_norm": 0.15343928337097168,
"learning_rate": 8.729705727120911e-06,
"loss": 0.3997,
"step": 429
},
{
"epoch": 0.9287257019438445,
"grad_norm": 0.1750892996788025,
"learning_rate": 8.721324604547851e-06,
"loss": 0.4151,
"step": 430
},
{
"epoch": 0.9308855291576674,
"grad_norm": 0.17041905224323273,
"learning_rate": 8.712919976066078e-06,
"loss": 0.4051,
"step": 431
},
{
"epoch": 0.9330453563714903,
"grad_norm": 0.17677395045757294,
"learning_rate": 8.704491894763794e-06,
"loss": 0.4031,
"step": 432
},
{
"epoch": 0.9352051835853131,
"grad_norm": 0.2149730920791626,
"learning_rate": 8.696040413877344e-06,
"loss": 0.4029,
"step": 433
},
{
"epoch": 0.937365010799136,
"grad_norm": 0.17261390388011932,
"learning_rate": 8.68756558679087e-06,
"loss": 0.3998,
"step": 434
},
{
"epoch": 0.9395248380129589,
"grad_norm": 0.17588981986045837,
"learning_rate": 8.679067467035989e-06,
"loss": 0.4127,
"step": 435
},
{
"epoch": 0.9416846652267818,
"grad_norm": 0.18429699540138245,
"learning_rate": 8.670546108291443e-06,
"loss": 0.3987,
"step": 436
},
{
"epoch": 0.9438444924406048,
"grad_norm": 0.15987183153629303,
"learning_rate": 8.662001564382768e-06,
"loss": 0.3911,
"step": 437
},
{
"epoch": 0.9460043196544277,
"grad_norm": 0.17549017071723938,
"learning_rate": 8.65343388928194e-06,
"loss": 0.4068,
"step": 438
},
{
"epoch": 0.9481641468682506,
"grad_norm": 0.1644325852394104,
"learning_rate": 8.644843137107058e-06,
"loss": 0.3938,
"step": 439
},
{
"epoch": 0.9503239740820735,
"grad_norm": 0.18092772364616394,
"learning_rate": 8.636229362121979e-06,
"loss": 0.4036,
"step": 440
},
{
"epoch": 0.9524838012958964,
"grad_norm": 0.19745442271232605,
"learning_rate": 8.627592618735989e-06,
"loss": 0.4131,
"step": 441
},
{
"epoch": 0.9546436285097192,
"grad_norm": 0.15399040281772614,
"learning_rate": 8.618932961503452e-06,
"loss": 0.3956,
"step": 442
},
{
"epoch": 0.9568034557235421,
"grad_norm": 0.21613968908786774,
"learning_rate": 8.610250445123472e-06,
"loss": 0.3957,
"step": 443
},
{
"epoch": 0.958963282937365,
"grad_norm": 0.15756168961524963,
"learning_rate": 8.601545124439535e-06,
"loss": 0.401,
"step": 444
},
{
"epoch": 0.9611231101511879,
"grad_norm": 0.16475795209407806,
"learning_rate": 8.592817054439184e-06,
"loss": 0.4091,
"step": 445
},
{
"epoch": 0.9632829373650108,
"grad_norm": 0.17942647635936737,
"learning_rate": 8.584066290253649e-06,
"loss": 0.3818,
"step": 446
},
{
"epoch": 0.9654427645788337,
"grad_norm": 0.1804707795381546,
"learning_rate": 8.575292887157515e-06,
"loss": 0.4036,
"step": 447
},
{
"epoch": 0.9676025917926566,
"grad_norm": 0.1610308587551117,
"learning_rate": 8.566496900568364e-06,
"loss": 0.4046,
"step": 448
},
{
"epoch": 0.9697624190064795,
"grad_norm": 0.17367208003997803,
"learning_rate": 8.557678386046429e-06,
"loss": 0.399,
"step": 449
},
{
"epoch": 0.9719222462203023,
"grad_norm": 0.16975344717502594,
"learning_rate": 8.548837399294235e-06,
"loss": 0.3973,
"step": 450
},
{
"epoch": 0.9740820734341252,
"grad_norm": 0.16336052119731903,
"learning_rate": 8.539973996156265e-06,
"loss": 0.4077,
"step": 451
},
{
"epoch": 0.9762419006479481,
"grad_norm": 0.16016145050525665,
"learning_rate": 8.531088232618587e-06,
"loss": 0.4005,
"step": 452
},
{
"epoch": 0.978401727861771,
"grad_norm": 0.15805621445178986,
"learning_rate": 8.522180164808515e-06,
"loss": 0.3885,
"step": 453
},
{
"epoch": 0.980561555075594,
"grad_norm": 0.15626148879528046,
"learning_rate": 8.513249848994248e-06,
"loss": 0.3912,
"step": 454
},
{
"epoch": 0.9827213822894169,
"grad_norm": 0.1786354035139084,
"learning_rate": 8.504297341584509e-06,
"loss": 0.4034,
"step": 455
},
{
"epoch": 0.9848812095032398,
"grad_norm": 0.1438089907169342,
"learning_rate": 8.495322699128206e-06,
"loss": 0.4003,
"step": 456
},
{
"epoch": 0.9870410367170627,
"grad_norm": 0.16011767089366913,
"learning_rate": 8.486325978314054e-06,
"loss": 0.3985,
"step": 457
},
{
"epoch": 0.9892008639308856,
"grad_norm": 0.18413770198822021,
"learning_rate": 8.477307235970235e-06,
"loss": 0.3855,
"step": 458
},
{
"epoch": 0.9913606911447084,
"grad_norm": 0.15895338356494904,
"learning_rate": 8.468266529064025e-06,
"loss": 0.3918,
"step": 459
},
{
"epoch": 0.9935205183585313,
"grad_norm": 0.172573059797287,
"learning_rate": 8.459203914701444e-06,
"loss": 0.3903,
"step": 460
},
{
"epoch": 0.9956803455723542,
"grad_norm": 0.1525600552558899,
"learning_rate": 8.450119450126889e-06,
"loss": 0.4066,
"step": 461
},
{
"epoch": 0.9978401727861771,
"grad_norm": 0.1875782459974289,
"learning_rate": 8.441013192722774e-06,
"loss": 0.405,
"step": 462
},
{
"epoch": 1.0,
"grad_norm": 0.18118026852607727,
"learning_rate": 8.431885200009172e-06,
"loss": 0.402,
"step": 463
},
{
"epoch": 1.0021598272138228,
"grad_norm": 0.1752985566854477,
"learning_rate": 8.422735529643445e-06,
"loss": 0.3926,
"step": 464
},
{
"epoch": 1.0043196544276458,
"grad_norm": 0.1703169196844101,
"learning_rate": 8.413564239419883e-06,
"loss": 0.3838,
"step": 465
},
{
"epoch": 1.0064794816414686,
"grad_norm": 0.181954026222229,
"learning_rate": 8.404371387269341e-06,
"loss": 0.3863,
"step": 466
},
{
"epoch": 1.0086393088552916,
"grad_norm": 0.16215139627456665,
"learning_rate": 8.39515703125887e-06,
"loss": 0.3849,
"step": 467
},
{
"epoch": 1.0107991360691144,
"grad_norm": 0.23999503254890442,
"learning_rate": 8.385921229591351e-06,
"loss": 0.3917,
"step": 468
},
{
"epoch": 1.0129589632829374,
"grad_norm": 0.1752462089061737,
"learning_rate": 8.376664040605122e-06,
"loss": 0.3812,
"step": 469
},
{
"epoch": 1.0151187904967602,
"grad_norm": 0.17159010469913483,
"learning_rate": 8.367385522773625e-06,
"loss": 0.386,
"step": 470
},
{
"epoch": 1.0172786177105833,
"grad_norm": 0.19381286203861237,
"learning_rate": 8.358085734705021e-06,
"loss": 0.3958,
"step": 471
},
{
"epoch": 1.019438444924406,
"grad_norm": 0.17137818038463593,
"learning_rate": 8.348764735141823e-06,
"loss": 0.3867,
"step": 472
},
{
"epoch": 1.0215982721382288,
"grad_norm": 0.18011374771595,
"learning_rate": 8.339422582960533e-06,
"loss": 0.3974,
"step": 473
},
{
"epoch": 1.0237580993520519,
"grad_norm": 0.18092289566993713,
"learning_rate": 8.33005933717126e-06,
"loss": 0.3697,
"step": 474
},
{
"epoch": 1.0259179265658747,
"grad_norm": 0.15359187126159668,
"learning_rate": 8.320675056917353e-06,
"loss": 0.3813,
"step": 475
},
{
"epoch": 1.0280777537796977,
"grad_norm": 0.16927658021450043,
"learning_rate": 8.311269801475026e-06,
"loss": 0.3834,
"step": 476
},
{
"epoch": 1.0302375809935205,
"grad_norm": 0.17222736775875092,
"learning_rate": 8.301843630252986e-06,
"loss": 0.3869,
"step": 477
},
{
"epoch": 1.0323974082073435,
"grad_norm": 0.17333416640758514,
"learning_rate": 8.29239660279205e-06,
"loss": 0.3853,
"step": 478
},
{
"epoch": 1.0345572354211663,
"grad_norm": 0.18697193264961243,
"learning_rate": 8.282928778764783e-06,
"loss": 0.3974,
"step": 479
},
{
"epoch": 1.0367170626349893,
"grad_norm": 0.1769992560148239,
"learning_rate": 8.273440217975103e-06,
"loss": 0.39,
"step": 480
},
{
"epoch": 1.038876889848812,
"grad_norm": 0.1826915144920349,
"learning_rate": 8.26393098035792e-06,
"loss": 0.383,
"step": 481
},
{
"epoch": 1.041036717062635,
"grad_norm": 0.18807494640350342,
"learning_rate": 8.254401125978744e-06,
"loss": 0.3875,
"step": 482
},
{
"epoch": 1.043196544276458,
"grad_norm": 0.1729234904050827,
"learning_rate": 8.244850715033316e-06,
"loss": 0.3888,
"step": 483
},
{
"epoch": 1.0453563714902807,
"grad_norm": 0.18379338085651398,
"learning_rate": 8.235279807847223e-06,
"loss": 0.3867,
"step": 484
},
{
"epoch": 1.0475161987041037,
"grad_norm": 0.1450575441122055,
"learning_rate": 8.225688464875514e-06,
"loss": 0.3895,
"step": 485
},
{
"epoch": 1.0496760259179265,
"grad_norm": 0.15889526903629303,
"learning_rate": 8.216076746702327e-06,
"loss": 0.3817,
"step": 486
},
{
"epoch": 1.0518358531317495,
"grad_norm": 0.16916847229003906,
"learning_rate": 8.206444714040496e-06,
"loss": 0.382,
"step": 487
},
{
"epoch": 1.0539956803455723,
"grad_norm": 0.1597558856010437,
"learning_rate": 8.196792427731175e-06,
"loss": 0.3905,
"step": 488
},
{
"epoch": 1.0561555075593954,
"grad_norm": 0.1566566675901413,
"learning_rate": 8.18711994874345e-06,
"loss": 0.3841,
"step": 489
},
{
"epoch": 1.0583153347732182,
"grad_norm": 0.17559486627578735,
"learning_rate": 8.177427338173955e-06,
"loss": 0.3792,
"step": 490
},
{
"epoch": 1.060475161987041,
"grad_norm": 0.15165618062019348,
"learning_rate": 8.167714657246486e-06,
"loss": 0.3804,
"step": 491
},
{
"epoch": 1.062634989200864,
"grad_norm": 0.15612950921058655,
"learning_rate": 8.157981967311614e-06,
"loss": 0.382,
"step": 492
},
{
"epoch": 1.0647948164146868,
"grad_norm": 0.16774365305900574,
"learning_rate": 8.1482293298463e-06,
"loss": 0.3905,
"step": 493
},
{
"epoch": 1.0669546436285098,
"grad_norm": 0.1574973613023758,
"learning_rate": 8.138456806453503e-06,
"loss": 0.3881,
"step": 494
},
{
"epoch": 1.0691144708423326,
"grad_norm": 0.20335029065608978,
"learning_rate": 8.12866445886179e-06,
"loss": 0.3752,
"step": 495
},
{
"epoch": 1.0712742980561556,
"grad_norm": 0.15830448269844055,
"learning_rate": 8.118852348924951e-06,
"loss": 0.3814,
"step": 496
},
{
"epoch": 1.0734341252699784,
"grad_norm": 0.20952075719833374,
"learning_rate": 8.109020538621607e-06,
"loss": 0.3798,
"step": 497
},
{
"epoch": 1.0755939524838012,
"grad_norm": 0.18261830508708954,
"learning_rate": 8.099169090054812e-06,
"loss": 0.3895,
"step": 498
},
{
"epoch": 1.0777537796976242,
"grad_norm": 0.20644772052764893,
"learning_rate": 8.089298065451673e-06,
"loss": 0.3744,
"step": 499
},
{
"epoch": 1.079913606911447,
"grad_norm": 0.17039693892002106,
"learning_rate": 8.079407527162944e-06,
"loss": 0.385,
"step": 500
},
{
"epoch": 1.08207343412527,
"grad_norm": 0.1829117089509964,
"learning_rate": 8.069497537662638e-06,
"loss": 0.3745,
"step": 501
},
{
"epoch": 1.0842332613390928,
"grad_norm": 0.16001296043395996,
"learning_rate": 8.05956815954764e-06,
"loss": 0.3796,
"step": 502
},
{
"epoch": 1.0863930885529158,
"grad_norm": 0.1937176138162613,
"learning_rate": 8.049619455537296e-06,
"loss": 0.3814,
"step": 503
},
{
"epoch": 1.0885529157667386,
"grad_norm": 0.15796703100204468,
"learning_rate": 8.039651488473028e-06,
"loss": 0.3804,
"step": 504
},
{
"epoch": 1.0907127429805616,
"grad_norm": 0.21041610836982727,
"learning_rate": 8.029664321317932e-06,
"loss": 0.3862,
"step": 505
},
{
"epoch": 1.0928725701943844,
"grad_norm": 0.18780550360679626,
"learning_rate": 8.019658017156384e-06,
"loss": 0.3807,
"step": 506
},
{
"epoch": 1.0950323974082075,
"grad_norm": 0.1692945808172226,
"learning_rate": 8.009632639193643e-06,
"loss": 0.3845,
"step": 507
},
{
"epoch": 1.0971922246220303,
"grad_norm": 0.18981167674064636,
"learning_rate": 7.999588250755442e-06,
"loss": 0.3848,
"step": 508
},
{
"epoch": 1.099352051835853,
"grad_norm": 0.15760387480258942,
"learning_rate": 7.989524915287595e-06,
"loss": 0.3757,
"step": 509
},
{
"epoch": 1.101511879049676,
"grad_norm": 0.140371173620224,
"learning_rate": 7.979442696355601e-06,
"loss": 0.3825,
"step": 510
},
{
"epoch": 1.1036717062634989,
"grad_norm": 0.15832389891147614,
"learning_rate": 7.969341657644236e-06,
"loss": 0.3863,
"step": 511
},
{
"epoch": 1.1058315334773219,
"grad_norm": 0.14990824460983276,
"learning_rate": 7.959221862957149e-06,
"loss": 0.3917,
"step": 512
},
{
"epoch": 1.1079913606911447,
"grad_norm": 0.15084333717823029,
"learning_rate": 7.94908337621646e-06,
"loss": 0.3863,
"step": 513
},
{
"epoch": 1.1101511879049677,
"grad_norm": 0.15440189838409424,
"learning_rate": 7.938926261462366e-06,
"loss": 0.3785,
"step": 514
},
{
"epoch": 1.1123110151187905,
"grad_norm": 0.16310814023017883,
"learning_rate": 7.928750582852722e-06,
"loss": 0.3796,
"step": 515
},
{
"epoch": 1.1144708423326133,
"grad_norm": 0.15400250256061554,
"learning_rate": 7.918556404662645e-06,
"loss": 0.3913,
"step": 516
},
{
"epoch": 1.1166306695464363,
"grad_norm": 0.16480040550231934,
"learning_rate": 7.908343791284104e-06,
"loss": 0.3817,
"step": 517
},
{
"epoch": 1.118790496760259,
"grad_norm": 0.14894555509090424,
"learning_rate": 7.898112807225517e-06,
"loss": 0.3797,
"step": 518
},
{
"epoch": 1.1209503239740821,
"grad_norm": 0.16937804222106934,
"learning_rate": 7.887863517111337e-06,
"loss": 0.3832,
"step": 519
},
{
"epoch": 1.123110151187905,
"grad_norm": 0.1606750190258026,
"learning_rate": 7.877595985681656e-06,
"loss": 0.3735,
"step": 520
},
{
"epoch": 1.125269978401728,
"grad_norm": 0.1703948825597763,
"learning_rate": 7.867310277791778e-06,
"loss": 0.3754,
"step": 521
},
{
"epoch": 1.1274298056155507,
"grad_norm": 0.1625399887561798,
"learning_rate": 7.857006458411826e-06,
"loss": 0.3773,
"step": 522
},
{
"epoch": 1.1295896328293737,
"grad_norm": 0.17872779071331024,
"learning_rate": 7.846684592626324e-06,
"loss": 0.3867,
"step": 523
},
{
"epoch": 1.1317494600431965,
"grad_norm": 0.14789296686649323,
"learning_rate": 7.836344745633785e-06,
"loss": 0.3794,
"step": 524
},
{
"epoch": 1.1339092872570196,
"grad_norm": 0.15560902655124664,
"learning_rate": 7.8259869827463e-06,
"loss": 0.3795,
"step": 525
},
{
"epoch": 1.1360691144708424,
"grad_norm": 0.1677931696176529,
"learning_rate": 7.815611369389134e-06,
"loss": 0.3921,
"step": 526
},
{
"epoch": 1.1382289416846652,
"grad_norm": 0.15654879808425903,
"learning_rate": 7.805217971100295e-06,
"loss": 0.3893,
"step": 527
},
{
"epoch": 1.1403887688984882,
"grad_norm": 0.15903332829475403,
"learning_rate": 7.794806853530139e-06,
"loss": 0.3791,
"step": 528
},
{
"epoch": 1.142548596112311,
"grad_norm": 0.1683822125196457,
"learning_rate": 7.78437808244094e-06,
"loss": 0.3877,
"step": 529
},
{
"epoch": 1.144708423326134,
"grad_norm": 0.15309610962867737,
"learning_rate": 7.773931723706487e-06,
"loss": 0.3746,
"step": 530
},
{
"epoch": 1.1468682505399568,
"grad_norm": 0.14578138291835785,
"learning_rate": 7.763467843311658e-06,
"loss": 0.3767,
"step": 531
},
{
"epoch": 1.1490280777537798,
"grad_norm": 0.16950742900371552,
"learning_rate": 7.752986507352009e-06,
"loss": 0.3873,
"step": 532
},
{
"epoch": 1.1511879049676026,
"grad_norm": 0.1471281796693802,
"learning_rate": 7.742487782033352e-06,
"loss": 0.3837,
"step": 533
},
{
"epoch": 1.1533477321814254,
"grad_norm": 0.14385339617729187,
"learning_rate": 7.731971733671347e-06,
"loss": 0.3944,
"step": 534
},
{
"epoch": 1.1555075593952484,
"grad_norm": 0.14128537476062775,
"learning_rate": 7.721438428691065e-06,
"loss": 0.3802,
"step": 535
},
{
"epoch": 1.1576673866090712,
"grad_norm": 0.1677146852016449,
"learning_rate": 7.71088793362659e-06,
"loss": 0.3812,
"step": 536
},
{
"epoch": 1.1598272138228942,
"grad_norm": 0.14564774930477142,
"learning_rate": 7.70032031512058e-06,
"loss": 0.3827,
"step": 537
},
{
"epoch": 1.161987041036717,
"grad_norm": 0.15598656237125397,
"learning_rate": 7.689735639923857e-06,
"loss": 0.3829,
"step": 538
},
{
"epoch": 1.16414686825054,
"grad_norm": 0.14980514347553253,
"learning_rate": 7.679133974894984e-06,
"loss": 0.3767,
"step": 539
},
{
"epoch": 1.1663066954643628,
"grad_norm": 0.15688128769397736,
"learning_rate": 7.668515386999837e-06,
"loss": 0.3931,
"step": 540
},
{
"epoch": 1.1684665226781856,
"grad_norm": 0.15419645607471466,
"learning_rate": 7.65787994331119e-06,
"loss": 0.375,
"step": 541
},
{
"epoch": 1.1706263498920086,
"grad_norm": 0.15213316679000854,
"learning_rate": 7.647227711008288e-06,
"loss": 0.3841,
"step": 542
},
{
"epoch": 1.1727861771058314,
"grad_norm": 0.14635787904262543,
"learning_rate": 7.636558757376413e-06,
"loss": 0.379,
"step": 543
},
{
"epoch": 1.1749460043196545,
"grad_norm": 0.1601177304983139,
"learning_rate": 7.6258731498064796e-06,
"loss": 0.3741,
"step": 544
},
{
"epoch": 1.1771058315334773,
"grad_norm": 0.15203504264354706,
"learning_rate": 7.615170955794592e-06,
"loss": 0.3764,
"step": 545
},
{
"epoch": 1.1792656587473003,
"grad_norm": 0.1715112179517746,
"learning_rate": 7.604452242941622e-06,
"loss": 0.3811,
"step": 546
},
{
"epoch": 1.181425485961123,
"grad_norm": 0.17397920787334442,
"learning_rate": 7.593717078952788e-06,
"loss": 0.3826,
"step": 547
},
{
"epoch": 1.183585313174946,
"grad_norm": 0.14259247481822968,
"learning_rate": 7.582965531637221e-06,
"loss": 0.3725,
"step": 548
},
{
"epoch": 1.1857451403887689,
"grad_norm": 0.16911283135414124,
"learning_rate": 7.572197668907533e-06,
"loss": 0.3915,
"step": 549
},
{
"epoch": 1.187904967602592,
"grad_norm": 0.1575639694929123,
"learning_rate": 7.561413558779401e-06,
"loss": 0.3719,
"step": 550
},
{
"epoch": 1.1900647948164147,
"grad_norm": 0.15729346871376038,
"learning_rate": 7.550613269371124e-06,
"loss": 0.3802,
"step": 551
},
{
"epoch": 1.1922246220302375,
"grad_norm": 0.16103574633598328,
"learning_rate": 7.5397968689032e-06,
"loss": 0.379,
"step": 552
},
{
"epoch": 1.1943844492440605,
"grad_norm": 0.16614358127117157,
"learning_rate": 7.528964425697895e-06,
"loss": 0.3874,
"step": 553
},
{
"epoch": 1.1965442764578833,
"grad_norm": 0.14216990768909454,
"learning_rate": 7.518116008178805e-06,
"loss": 0.3791,
"step": 554
},
{
"epoch": 1.1987041036717063,
"grad_norm": 0.15424562990665436,
"learning_rate": 7.507251684870433e-06,
"loss": 0.3855,
"step": 555
},
{
"epoch": 1.2008639308855291,
"grad_norm": 0.15728497505187988,
"learning_rate": 7.496371524397747e-06,
"loss": 0.3767,
"step": 556
},
{
"epoch": 1.2030237580993521,
"grad_norm": 0.16239339113235474,
"learning_rate": 7.485475595485756e-06,
"loss": 0.39,
"step": 557
},
{
"epoch": 1.205183585313175,
"grad_norm": 0.18078574538230896,
"learning_rate": 7.474563966959068e-06,
"loss": 0.3805,
"step": 558
},
{
"epoch": 1.2073434125269977,
"grad_norm": 0.1507551670074463,
"learning_rate": 7.463636707741458e-06,
"loss": 0.385,
"step": 559
},
{
"epoch": 1.2095032397408207,
"grad_norm": 0.1794394552707672,
"learning_rate": 7.452693886855438e-06,
"loss": 0.3869,
"step": 560
},
{
"epoch": 1.2116630669546435,
"grad_norm": 0.17479896545410156,
"learning_rate": 7.4417355734218085e-06,
"loss": 0.3763,
"step": 561
},
{
"epoch": 1.2138228941684666,
"grad_norm": 0.15585078299045563,
"learning_rate": 7.430761836659235e-06,
"loss": 0.3893,
"step": 562
},
{
"epoch": 1.2159827213822894,
"grad_norm": 0.17647355794906616,
"learning_rate": 7.4197727458837995e-06,
"loss": 0.3858,
"step": 563
},
{
"epoch": 1.2181425485961124,
"grad_norm": 0.1657349020242691,
"learning_rate": 7.408768370508577e-06,
"loss": 0.3787,
"step": 564
},
{
"epoch": 1.2203023758099352,
"grad_norm": 0.15990415215492249,
"learning_rate": 7.397748780043179e-06,
"loss": 0.3816,
"step": 565
},
{
"epoch": 1.2224622030237582,
"grad_norm": 0.16552990674972534,
"learning_rate": 7.386714044093331e-06,
"loss": 0.3818,
"step": 566
},
{
"epoch": 1.224622030237581,
"grad_norm": 0.17762261629104614,
"learning_rate": 7.375664232360421e-06,
"loss": 0.3823,
"step": 567
},
{
"epoch": 1.226781857451404,
"grad_norm": 0.17362867295742035,
"learning_rate": 7.364599414641064e-06,
"loss": 0.3796,
"step": 568
},
{
"epoch": 1.2289416846652268,
"grad_norm": 0.15305167436599731,
"learning_rate": 7.353519660826665e-06,
"loss": 0.3816,
"step": 569
},
{
"epoch": 1.2311015118790496,
"grad_norm": 0.1919698268175125,
"learning_rate": 7.342425040902967e-06,
"loss": 0.3927,
"step": 570
},
{
"epoch": 1.2332613390928726,
"grad_norm": 0.15654806792736053,
"learning_rate": 7.331315624949624e-06,
"loss": 0.3844,
"step": 571
},
{
"epoch": 1.2354211663066954,
"grad_norm": 0.1898239254951477,
"learning_rate": 7.320191483139742e-06,
"loss": 0.3935,
"step": 572
},
{
"epoch": 1.2375809935205184,
"grad_norm": 0.15385276079177856,
"learning_rate": 7.309052685739448e-06,
"loss": 0.3731,
"step": 573
},
{
"epoch": 1.2397408207343412,
"grad_norm": 0.15585872530937195,
"learning_rate": 7.297899303107441e-06,
"loss": 0.3802,
"step": 574
},
{
"epoch": 1.2419006479481642,
"grad_norm": 0.14450909197330475,
"learning_rate": 7.286731405694544e-06,
"loss": 0.368,
"step": 575
},
{
"epoch": 1.244060475161987,
"grad_norm": 0.15306542813777924,
"learning_rate": 7.275549064043269e-06,
"loss": 0.3827,
"step": 576
},
{
"epoch": 1.2462203023758098,
"grad_norm": 0.15712149441242218,
"learning_rate": 7.264352348787364e-06,
"loss": 0.3933,
"step": 577
},
{
"epoch": 1.2483801295896328,
"grad_norm": 0.16853763163089752,
"learning_rate": 7.253141330651367e-06,
"loss": 0.3886,
"step": 578
},
{
"epoch": 1.2505399568034556,
"grad_norm": 0.15141934156417847,
"learning_rate": 7.241916080450163e-06,
"loss": 0.373,
"step": 579
},
{
"epoch": 1.2526997840172787,
"grad_norm": 0.16748425364494324,
"learning_rate": 7.23067666908853e-06,
"loss": 0.3779,
"step": 580
},
{
"epoch": 1.2548596112311015,
"grad_norm": 0.15394426882266998,
"learning_rate": 7.219423167560701e-06,
"loss": 0.3803,
"step": 581
},
{
"epoch": 1.2570194384449245,
"grad_norm": 0.15716637670993805,
"learning_rate": 7.208155646949908e-06,
"loss": 0.3903,
"step": 582
},
{
"epoch": 1.2591792656587473,
"grad_norm": 0.17571674287319183,
"learning_rate": 7.196874178427933e-06,
"loss": 0.3693,
"step": 583
},
{
"epoch": 1.26133909287257,
"grad_norm": 0.16210925579071045,
"learning_rate": 7.185578833254665e-06,
"loss": 0.3806,
"step": 584
},
{
"epoch": 1.263498920086393,
"grad_norm": 0.17312122881412506,
"learning_rate": 7.1742696827776415e-06,
"loss": 0.3867,
"step": 585
},
{
"epoch": 1.265658747300216,
"grad_norm": 0.16945572197437286,
"learning_rate": 7.162946798431605e-06,
"loss": 0.3834,
"step": 586
},
{
"epoch": 1.267818574514039,
"grad_norm": 0.15858979523181915,
"learning_rate": 7.151610251738045e-06,
"loss": 0.3837,
"step": 587
},
{
"epoch": 1.2699784017278617,
"grad_norm": 0.14600925147533417,
"learning_rate": 7.1402601143047514e-06,
"loss": 0.3797,
"step": 588
},
{
"epoch": 1.2721382289416847,
"grad_norm": 0.15963494777679443,
"learning_rate": 7.128896457825364e-06,
"loss": 0.3904,
"step": 589
},
{
"epoch": 1.2742980561555075,
"grad_norm": 0.1409822553396225,
"learning_rate": 7.11751935407891e-06,
"loss": 0.384,
"step": 590
},
{
"epoch": 1.2764578833693305,
"grad_norm": 0.1461641937494278,
"learning_rate": 7.106128874929364e-06,
"loss": 0.3769,
"step": 591
},
{
"epoch": 1.2786177105831533,
"grad_norm": 0.1487351655960083,
"learning_rate": 7.094725092325177e-06,
"loss": 0.3766,
"step": 592
},
{
"epoch": 1.2807775377969763,
"grad_norm": 0.1428721696138382,
"learning_rate": 7.08330807829884e-06,
"loss": 0.3833,
"step": 593
},
{
"epoch": 1.2829373650107991,
"grad_norm": 0.14245618879795074,
"learning_rate": 7.071877904966422e-06,
"loss": 0.382,
"step": 594
},
{
"epoch": 1.285097192224622,
"grad_norm": 0.1549312025308609,
"learning_rate": 7.060434644527105e-06,
"loss": 0.3723,
"step": 595
},
{
"epoch": 1.287257019438445,
"grad_norm": 0.14332742989063263,
"learning_rate": 7.048978369262747e-06,
"loss": 0.385,
"step": 596
},
{
"epoch": 1.2894168466522677,
"grad_norm": 0.15278279781341553,
"learning_rate": 7.037509151537404e-06,
"loss": 0.3715,
"step": 597
},
{
"epoch": 1.2915766738660908,
"grad_norm": 0.14458084106445312,
"learning_rate": 7.026027063796891e-06,
"loss": 0.3708,
"step": 598
},
{
"epoch": 1.2937365010799136,
"grad_norm": 0.15547068417072296,
"learning_rate": 7.014532178568314e-06,
"loss": 0.3784,
"step": 599
},
{
"epoch": 1.2958963282937366,
"grad_norm": 0.15412218868732452,
"learning_rate": 7.003024568459614e-06,
"loss": 0.3785,
"step": 600
},
{
"epoch": 1.2980561555075594,
"grad_norm": 0.15792393684387207,
"learning_rate": 6.991504306159115e-06,
"loss": 0.3912,
"step": 601
},
{
"epoch": 1.3002159827213822,
"grad_norm": 0.1512409746646881,
"learning_rate": 6.9799714644350504e-06,
"loss": 0.3822,
"step": 602
},
{
"epoch": 1.3023758099352052,
"grad_norm": 0.15624138712882996,
"learning_rate": 6.968426116135118e-06,
"loss": 0.3786,
"step": 603
},
{
"epoch": 1.3045356371490282,
"grad_norm": 0.1699935495853424,
"learning_rate": 6.9568683341860135e-06,
"loss": 0.382,
"step": 604
},
{
"epoch": 1.306695464362851,
"grad_norm": 0.1427888125181198,
"learning_rate": 6.945298191592967e-06,
"loss": 0.3694,
"step": 605
},
{
"epoch": 1.3088552915766738,
"grad_norm": 0.15631450712680817,
"learning_rate": 6.93371576143929e-06,
"loss": 0.3846,
"step": 606
},
{
"epoch": 1.3110151187904968,
"grad_norm": 0.15259280800819397,
"learning_rate": 6.922121116885905e-06,
"loss": 0.378,
"step": 607
},
{
"epoch": 1.3131749460043196,
"grad_norm": 0.13901083171367645,
"learning_rate": 6.910514331170888e-06,
"loss": 0.3852,
"step": 608
},
{
"epoch": 1.3153347732181426,
"grad_norm": 0.15216070413589478,
"learning_rate": 6.898895477609007e-06,
"loss": 0.3852,
"step": 609
},
{
"epoch": 1.3174946004319654,
"grad_norm": 0.13873577117919922,
"learning_rate": 6.887264629591254e-06,
"loss": 0.3677,
"step": 610
},
{
"epoch": 1.3196544276457884,
"grad_norm": 0.15047885477542877,
"learning_rate": 6.875621860584389e-06,
"loss": 0.3811,
"step": 611
},
{
"epoch": 1.3218142548596112,
"grad_norm": 0.13761691749095917,
"learning_rate": 6.863967244130467e-06,
"loss": 0.3766,
"step": 612
},
{
"epoch": 1.323974082073434,
"grad_norm": 0.14068377017974854,
"learning_rate": 6.852300853846381e-06,
"loss": 0.3768,
"step": 613
},
{
"epoch": 1.326133909287257,
"grad_norm": 0.14240694046020508,
"learning_rate": 6.840622763423391e-06,
"loss": 0.3804,
"step": 614
},
{
"epoch": 1.3282937365010798,
"grad_norm": 0.14259974658489227,
"learning_rate": 6.8289330466266635e-06,
"loss": 0.3796,
"step": 615
},
{
"epoch": 1.3304535637149029,
"grad_norm": 0.13572795689105988,
"learning_rate": 6.817231777294804e-06,
"loss": 0.3791,
"step": 616
},
{
"epoch": 1.3326133909287257,
"grad_norm": 0.14472903311252594,
"learning_rate": 6.805519029339388e-06,
"loss": 0.3825,
"step": 617
},
{
"epoch": 1.3347732181425487,
"grad_norm": 0.13924075663089752,
"learning_rate": 6.793794876744499e-06,
"loss": 0.3822,
"step": 618
},
{
"epoch": 1.3369330453563715,
"grad_norm": 0.1507209688425064,
"learning_rate": 6.782059393566254e-06,
"loss": 0.3799,
"step": 619
},
{
"epoch": 1.3390928725701943,
"grad_norm": 0.15504410862922668,
"learning_rate": 6.770312653932346e-06,
"loss": 0.396,
"step": 620
},
{
"epoch": 1.3412526997840173,
"grad_norm": 0.14195454120635986,
"learning_rate": 6.758554732041564e-06,
"loss": 0.3797,
"step": 621
},
{
"epoch": 1.3434125269978403,
"grad_norm": 0.158910870552063,
"learning_rate": 6.7467857021633354e-06,
"loss": 0.3923,
"step": 622
},
{
"epoch": 1.345572354211663,
"grad_norm": 0.1433819979429245,
"learning_rate": 6.7350056386372485e-06,
"loss": 0.3819,
"step": 623
},
{
"epoch": 1.347732181425486,
"grad_norm": 0.1474255919456482,
"learning_rate": 6.723214615872585e-06,
"loss": 0.3819,
"step": 624
},
{
"epoch": 1.349892008639309,
"grad_norm": 0.15845805406570435,
"learning_rate": 6.711412708347857e-06,
"loss": 0.39,
"step": 625
},
{
"epoch": 1.3520518358531317,
"grad_norm": 0.12925179302692413,
"learning_rate": 6.699599990610324e-06,
"loss": 0.3779,
"step": 626
},
{
"epoch": 1.3542116630669545,
"grad_norm": 0.1499335616827011,
"learning_rate": 6.68777653727553e-06,
"loss": 0.3804,
"step": 627
},
{
"epoch": 1.3563714902807775,
"grad_norm": 0.15274159610271454,
"learning_rate": 6.675942423026834e-06,
"loss": 0.3783,
"step": 628
},
{
"epoch": 1.3585313174946005,
"grad_norm": 0.13748182356357574,
"learning_rate": 6.664097722614934e-06,
"loss": 0.3735,
"step": 629
},
{
"epoch": 1.3606911447084233,
"grad_norm": 0.14609220623970032,
"learning_rate": 6.652242510857395e-06,
"loss": 0.392,
"step": 630
},
{
"epoch": 1.3628509719222461,
"grad_norm": 0.1698596030473709,
"learning_rate": 6.640376862638176e-06,
"loss": 0.3832,
"step": 631
},
{
"epoch": 1.3650107991360692,
"grad_norm": 0.1435316503047943,
"learning_rate": 6.6285008529071615e-06,
"loss": 0.3819,
"step": 632
},
{
"epoch": 1.367170626349892,
"grad_norm": 0.13530634343624115,
"learning_rate": 6.616614556679684e-06,
"loss": 0.3809,
"step": 633
},
{
"epoch": 1.369330453563715,
"grad_norm": 0.18893133103847504,
"learning_rate": 6.604718049036047e-06,
"loss": 0.3828,
"step": 634
},
{
"epoch": 1.3714902807775378,
"grad_norm": 0.15310388803482056,
"learning_rate": 6.592811405121064e-06,
"loss": 0.3831,
"step": 635
},
{
"epoch": 1.3736501079913608,
"grad_norm": 0.14660876989364624,
"learning_rate": 6.580894700143565e-06,
"loss": 0.3781,
"step": 636
},
{
"epoch": 1.3758099352051836,
"grad_norm": 0.14833448827266693,
"learning_rate": 6.568968009375938e-06,
"loss": 0.3775,
"step": 637
},
{
"epoch": 1.3779697624190064,
"grad_norm": 0.1682375818490982,
"learning_rate": 6.557031408153642e-06,
"loss": 0.3758,
"step": 638
},
{
"epoch": 1.3801295896328294,
"grad_norm": 0.1533748358488083,
"learning_rate": 6.545084971874738e-06,
"loss": 0.3793,
"step": 639
},
{
"epoch": 1.3822894168466522,
"grad_norm": 0.14140866696834564,
"learning_rate": 6.533128775999411e-06,
"loss": 0.384,
"step": 640
},
{
"epoch": 1.3844492440604752,
"grad_norm": 0.14936432242393494,
"learning_rate": 6.521162896049491e-06,
"loss": 0.3891,
"step": 641
},
{
"epoch": 1.386609071274298,
"grad_norm": 0.15731281042099,
"learning_rate": 6.509187407607981e-06,
"loss": 0.3841,
"step": 642
},
{
"epoch": 1.388768898488121,
"grad_norm": 0.15000282227993011,
"learning_rate": 6.497202386318573e-06,
"loss": 0.3851,
"step": 643
},
{
"epoch": 1.3909287257019438,
"grad_norm": 0.14697906374931335,
"learning_rate": 6.485207907885175e-06,
"loss": 0.3773,
"step": 644
},
{
"epoch": 1.3930885529157666,
"grad_norm": 0.1559685468673706,
"learning_rate": 6.473204048071433e-06,
"loss": 0.3821,
"step": 645
},
{
"epoch": 1.3952483801295896,
"grad_norm": 0.15039733052253723,
"learning_rate": 6.4611908827002504e-06,
"loss": 0.3847,
"step": 646
},
{
"epoch": 1.3974082073434126,
"grad_norm": 0.16008631885051727,
"learning_rate": 6.449168487653305e-06,
"loss": 0.3802,
"step": 647
},
{
"epoch": 1.3995680345572354,
"grad_norm": 0.14514020085334778,
"learning_rate": 6.437136938870583e-06,
"loss": 0.3841,
"step": 648
},
{
"epoch": 1.4017278617710582,
"grad_norm": 0.15419939160346985,
"learning_rate": 6.425096312349881e-06,
"loss": 0.3903,
"step": 649
},
{
"epoch": 1.4038876889848813,
"grad_norm": 0.1530395895242691,
"learning_rate": 6.413046684146343e-06,
"loss": 0.3794,
"step": 650
},
{
"epoch": 1.406047516198704,
"grad_norm": 0.15808750689029694,
"learning_rate": 6.400988130371969e-06,
"loss": 0.3766,
"step": 651
},
{
"epoch": 1.408207343412527,
"grad_norm": 0.14891669154167175,
"learning_rate": 6.388920727195138e-06,
"loss": 0.3781,
"step": 652
},
{
"epoch": 1.4103671706263499,
"grad_norm": 0.14925065636634827,
"learning_rate": 6.376844550840126e-06,
"loss": 0.3906,
"step": 653
},
{
"epoch": 1.4125269978401729,
"grad_norm": 0.16382241249084473,
"learning_rate": 6.364759677586627e-06,
"loss": 0.3771,
"step": 654
},
{
"epoch": 1.4146868250539957,
"grad_norm": 0.1546749770641327,
"learning_rate": 6.352666183769269e-06,
"loss": 0.3863,
"step": 655
},
{
"epoch": 1.4168466522678185,
"grad_norm": 0.14334626495838165,
"learning_rate": 6.340564145777131e-06,
"loss": 0.3742,
"step": 656
},
{
"epoch": 1.4190064794816415,
"grad_norm": 0.15877507627010345,
"learning_rate": 6.328453640053264e-06,
"loss": 0.3779,
"step": 657
},
{
"epoch": 1.4211663066954643,
"grad_norm": 0.1556321382522583,
"learning_rate": 6.316334743094201e-06,
"loss": 0.3739,
"step": 658
},
{
"epoch": 1.4233261339092873,
"grad_norm": 0.14519956707954407,
"learning_rate": 6.304207531449486e-06,
"loss": 0.3786,
"step": 659
},
{
"epoch": 1.42548596112311,
"grad_norm": 0.14613112807273865,
"learning_rate": 6.292072081721173e-06,
"loss": 0.381,
"step": 660
},
{
"epoch": 1.4276457883369331,
"grad_norm": 0.15813447535037994,
"learning_rate": 6.279928470563365e-06,
"loss": 0.3866,
"step": 661
},
{
"epoch": 1.429805615550756,
"grad_norm": 0.15421751141548157,
"learning_rate": 6.267776774681703e-06,
"loss": 0.3796,
"step": 662
},
{
"epoch": 1.4319654427645787,
"grad_norm": 0.15891966223716736,
"learning_rate": 6.255617070832908e-06,
"loss": 0.3717,
"step": 663
},
{
"epoch": 1.4341252699784017,
"grad_norm": 0.15779848396778107,
"learning_rate": 6.243449435824276e-06,
"loss": 0.3701,
"step": 664
},
{
"epoch": 1.4362850971922247,
"grad_norm": 0.14361926913261414,
"learning_rate": 6.231273946513201e-06,
"loss": 0.3698,
"step": 665
},
{
"epoch": 1.4384449244060475,
"grad_norm": 0.1553213894367218,
"learning_rate": 6.219090679806694e-06,
"loss": 0.381,
"step": 666
},
{
"epoch": 1.4406047516198703,
"grad_norm": 0.14260952174663544,
"learning_rate": 6.206899712660887e-06,
"loss": 0.3734,
"step": 667
},
{
"epoch": 1.4427645788336934,
"grad_norm": 0.147933229804039,
"learning_rate": 6.1947011220805535e-06,
"loss": 0.3799,
"step": 668
},
{
"epoch": 1.4449244060475162,
"grad_norm": 0.15127696096897125,
"learning_rate": 6.182494985118625e-06,
"loss": 0.3792,
"step": 669
},
{
"epoch": 1.4470842332613392,
"grad_norm": 0.14316388964653015,
"learning_rate": 6.170281378875692e-06,
"loss": 0.3727,
"step": 670
},
{
"epoch": 1.449244060475162,
"grad_norm": 0.14327897131443024,
"learning_rate": 6.158060380499533e-06,
"loss": 0.3823,
"step": 671
},
{
"epoch": 1.451403887688985,
"grad_norm": 0.1345098912715912,
"learning_rate": 6.145832067184614e-06,
"loss": 0.3924,
"step": 672
},
{
"epoch": 1.4535637149028078,
"grad_norm": 0.13999171555042267,
"learning_rate": 6.133596516171609e-06,
"loss": 0.3809,
"step": 673
},
{
"epoch": 1.4557235421166306,
"grad_norm": 0.12059102207422256,
"learning_rate": 6.121353804746907e-06,
"loss": 0.3788,
"step": 674
},
{
"epoch": 1.4578833693304536,
"grad_norm": 0.14187489449977875,
"learning_rate": 6.109104010242127e-06,
"loss": 0.3845,
"step": 675
},
{
"epoch": 1.4600431965442764,
"grad_norm": 0.14432717859745026,
"learning_rate": 6.09684721003363e-06,
"loss": 0.3801,
"step": 676
},
{
"epoch": 1.4622030237580994,
"grad_norm": 0.1373838186264038,
"learning_rate": 6.084583481542028e-06,
"loss": 0.3731,
"step": 677
},
{
"epoch": 1.4643628509719222,
"grad_norm": 0.15109464526176453,
"learning_rate": 6.072312902231692e-06,
"loss": 0.3895,
"step": 678
},
{
"epoch": 1.4665226781857452,
"grad_norm": 0.15525732934474945,
"learning_rate": 6.060035549610275e-06,
"loss": 0.3785,
"step": 679
},
{
"epoch": 1.468682505399568,
"grad_norm": 0.14632560312747955,
"learning_rate": 6.047751501228203e-06,
"loss": 0.3793,
"step": 680
},
{
"epoch": 1.4708423326133908,
"grad_norm": 0.1495695561170578,
"learning_rate": 6.0354608346782075e-06,
"loss": 0.3817,
"step": 681
},
{
"epoch": 1.4730021598272138,
"grad_norm": 0.1651640236377716,
"learning_rate": 6.023163627594813e-06,
"loss": 0.386,
"step": 682
},
{
"epoch": 1.4751619870410368,
"grad_norm": 0.14565086364746094,
"learning_rate": 6.010859957653869e-06,
"loss": 0.3749,
"step": 683
},
{
"epoch": 1.4773218142548596,
"grad_norm": 0.1538180410861969,
"learning_rate": 5.9985499025720354e-06,
"loss": 0.3769,
"step": 684
},
{
"epoch": 1.4794816414686824,
"grad_norm": 0.12949156761169434,
"learning_rate": 5.986233540106315e-06,
"loss": 0.3721,
"step": 685
},
{
"epoch": 1.4816414686825055,
"grad_norm": 0.14916200935840607,
"learning_rate": 5.973910948053545e-06,
"loss": 0.386,
"step": 686
},
{
"epoch": 1.4838012958963283,
"grad_norm": 0.1727481335401535,
"learning_rate": 5.961582204249915e-06,
"loss": 0.3769,
"step": 687
},
{
"epoch": 1.485961123110151,
"grad_norm": 0.1516205221414566,
"learning_rate": 5.949247386570471e-06,
"loss": 0.3865,
"step": 688
},
{
"epoch": 1.488120950323974,
"grad_norm": 0.164317786693573,
"learning_rate": 5.936906572928625e-06,
"loss": 0.3803,
"step": 689
},
{
"epoch": 1.490280777537797,
"grad_norm": 0.15949031710624695,
"learning_rate": 5.924559841275661e-06,
"loss": 0.3819,
"step": 690
},
{
"epoch": 1.4924406047516199,
"grad_norm": 0.13948768377304077,
"learning_rate": 5.912207269600252e-06,
"loss": 0.381,
"step": 691
},
{
"epoch": 1.4946004319654427,
"grad_norm": 0.17031709849834442,
"learning_rate": 5.89984893592795e-06,
"loss": 0.3837,
"step": 692
},
{
"epoch": 1.4967602591792657,
"grad_norm": 0.13423483073711395,
"learning_rate": 5.887484918320708e-06,
"loss": 0.3824,
"step": 693
},
{
"epoch": 1.4989200863930885,
"grad_norm": 0.15468288958072662,
"learning_rate": 5.8751152948763815e-06,
"loss": 0.372,
"step": 694
},
{
"epoch": 1.5010799136069113,
"grad_norm": 0.16139718890190125,
"learning_rate": 5.8627401437282334e-06,
"loss": 0.3775,
"step": 695
},
{
"epoch": 1.5032397408207343,
"grad_norm": 0.15016594529151917,
"learning_rate": 5.850359543044446e-06,
"loss": 0.3781,
"step": 696
},
{
"epoch": 1.5053995680345573,
"grad_norm": 0.1405385285615921,
"learning_rate": 5.837973571027621e-06,
"loss": 0.3789,
"step": 697
},
{
"epoch": 1.5075593952483801,
"grad_norm": 0.15115734934806824,
"learning_rate": 5.82558230591429e-06,
"loss": 0.384,
"step": 698
},
{
"epoch": 1.509719222462203,
"grad_norm": 0.14200249314308167,
"learning_rate": 5.813185825974419e-06,
"loss": 0.3846,
"step": 699
},
{
"epoch": 1.511879049676026,
"grad_norm": 0.147098109126091,
"learning_rate": 5.80078420951091e-06,
"loss": 0.3839,
"step": 700
},
{
"epoch": 1.514038876889849,
"grad_norm": 0.1505637764930725,
"learning_rate": 5.7883775348591146e-06,
"loss": 0.3795,
"step": 701
},
{
"epoch": 1.5161987041036717,
"grad_norm": 0.14681459963321686,
"learning_rate": 5.77596588038633e-06,
"loss": 0.3879,
"step": 702
},
{
"epoch": 1.5183585313174945,
"grad_norm": 0.1480962336063385,
"learning_rate": 5.763549324491317e-06,
"loss": 0.3851,
"step": 703
},
{
"epoch": 1.5205183585313176,
"grad_norm": 0.1447979360818863,
"learning_rate": 5.751127945603786e-06,
"loss": 0.379,
"step": 704
},
{
"epoch": 1.5226781857451404,
"grad_norm": 0.14420117437839508,
"learning_rate": 5.7387018221839195e-06,
"loss": 0.3844,
"step": 705
},
{
"epoch": 1.5248380129589632,
"grad_norm": 0.1400950700044632,
"learning_rate": 5.726271032721864e-06,
"loss": 0.3854,
"step": 706
},
{
"epoch": 1.5269978401727862,
"grad_norm": 0.14526066184043884,
"learning_rate": 5.7138356557372444e-06,
"loss": 0.3815,
"step": 707
},
{
"epoch": 1.5291576673866092,
"grad_norm": 0.15576431155204773,
"learning_rate": 5.70139576977866e-06,
"loss": 0.3866,
"step": 708
},
{
"epoch": 1.531317494600432,
"grad_norm": 0.1484983265399933,
"learning_rate": 5.68895145342319e-06,
"loss": 0.3695,
"step": 709
},
{
"epoch": 1.5334773218142548,
"grad_norm": 0.14368119835853577,
"learning_rate": 5.6765027852759015e-06,
"loss": 0.3751,
"step": 710
},
{
"epoch": 1.5356371490280778,
"grad_norm": 0.14985284209251404,
"learning_rate": 5.664049843969348e-06,
"loss": 0.3759,
"step": 711
},
{
"epoch": 1.5377969762419006,
"grad_norm": 0.14019222557544708,
"learning_rate": 5.651592708163074e-06,
"loss": 0.3768,
"step": 712
},
{
"epoch": 1.5399568034557234,
"grad_norm": 0.15098613500595093,
"learning_rate": 5.639131456543119e-06,
"loss": 0.3755,
"step": 713
},
{
"epoch": 1.5421166306695464,
"grad_norm": 0.13311173021793365,
"learning_rate": 5.626666167821522e-06,
"loss": 0.3727,
"step": 714
},
{
"epoch": 1.5442764578833694,
"grad_norm": 0.1466924548149109,
"learning_rate": 5.614196920735822e-06,
"loss": 0.3816,
"step": 715
},
{
"epoch": 1.5464362850971922,
"grad_norm": 0.14080245792865753,
"learning_rate": 5.601723794048558e-06,
"loss": 0.3808,
"step": 716
},
{
"epoch": 1.548596112311015,
"grad_norm": 0.13415026664733887,
"learning_rate": 5.58924686654678e-06,
"loss": 0.3846,
"step": 717
},
{
"epoch": 1.550755939524838,
"grad_norm": 0.14663489162921906,
"learning_rate": 5.576766217041541e-06,
"loss": 0.3728,
"step": 718
},
{
"epoch": 1.552915766738661,
"grad_norm": 0.14009855687618256,
"learning_rate": 5.5642819243674085e-06,
"loss": 0.3661,
"step": 719
},
{
"epoch": 1.5550755939524838,
"grad_norm": 0.12949179112911224,
"learning_rate": 5.551794067381959e-06,
"loss": 0.3766,
"step": 720
},
{
"epoch": 1.5572354211663066,
"grad_norm": 0.1387072652578354,
"learning_rate": 5.5393027249652844e-06,
"loss": 0.3863,
"step": 721
},
{
"epoch": 1.5593952483801297,
"grad_norm": 0.13852353394031525,
"learning_rate": 5.526807976019492e-06,
"loss": 0.3777,
"step": 722
},
{
"epoch": 1.5615550755939525,
"grad_norm": 0.144193634390831,
"learning_rate": 5.514309899468209e-06,
"loss": 0.3708,
"step": 723
},
{
"epoch": 1.5637149028077753,
"grad_norm": 0.13979433476924896,
"learning_rate": 5.5018085742560745e-06,
"loss": 0.3827,
"step": 724
},
{
"epoch": 1.5658747300215983,
"grad_norm": 0.1412208378314972,
"learning_rate": 5.489304079348259e-06,
"loss": 0.3819,
"step": 725
},
{
"epoch": 1.5680345572354213,
"grad_norm": 0.14092062413692474,
"learning_rate": 5.476796493729943e-06,
"loss": 0.38,
"step": 726
},
{
"epoch": 1.570194384449244,
"grad_norm": 0.15495967864990234,
"learning_rate": 5.46428589640584e-06,
"loss": 0.3941,
"step": 727
},
{
"epoch": 1.5723542116630669,
"grad_norm": 0.14512132108211517,
"learning_rate": 5.451772366399678e-06,
"loss": 0.3912,
"step": 728
},
{
"epoch": 1.57451403887689,
"grad_norm": 0.15392383933067322,
"learning_rate": 5.439255982753717e-06,
"loss": 0.3751,
"step": 729
},
{
"epoch": 1.5766738660907127,
"grad_norm": 0.14311961829662323,
"learning_rate": 5.426736824528236e-06,
"loss": 0.379,
"step": 730
},
{
"epoch": 1.5788336933045355,
"grad_norm": 0.14618806540966034,
"learning_rate": 5.414214970801041e-06,
"loss": 0.3794,
"step": 731
},
{
"epoch": 1.5809935205183585,
"grad_norm": 0.13308942317962646,
"learning_rate": 5.401690500666972e-06,
"loss": 0.3823,
"step": 732
},
{
"epoch": 1.5831533477321815,
"grad_norm": 0.14792852103710175,
"learning_rate": 5.389163493237382e-06,
"loss": 0.379,
"step": 733
},
{
"epoch": 1.5853131749460043,
"grad_norm": 0.15516813099384308,
"learning_rate": 5.376634027639664e-06,
"loss": 0.381,
"step": 734
},
{
"epoch": 1.5874730021598271,
"grad_norm": 0.13429994881153107,
"learning_rate": 5.36410218301673e-06,
"loss": 0.3848,
"step": 735
},
{
"epoch": 1.5896328293736501,
"grad_norm": 0.13838984072208405,
"learning_rate": 5.35156803852652e-06,
"loss": 0.3802,
"step": 736
},
{
"epoch": 1.5917926565874732,
"grad_norm": 0.14528213441371918,
"learning_rate": 5.339031673341505e-06,
"loss": 0.3677,
"step": 737
},
{
"epoch": 1.593952483801296,
"grad_norm": 0.14330457150936127,
"learning_rate": 5.326493166648179e-06,
"loss": 0.3754,
"step": 738
},
{
"epoch": 1.5961123110151187,
"grad_norm": 0.150346577167511,
"learning_rate": 5.3139525976465675e-06,
"loss": 0.3867,
"step": 739
},
{
"epoch": 1.5982721382289418,
"grad_norm": 0.13384312391281128,
"learning_rate": 5.301410045549719e-06,
"loss": 0.3807,
"step": 740
},
{
"epoch": 1.6004319654427646,
"grad_norm": 0.13676618039608002,
"learning_rate": 5.2888655895832075e-06,
"loss": 0.3776,
"step": 741
},
{
"epoch": 1.6025917926565874,
"grad_norm": 0.1279810667037964,
"learning_rate": 5.276319308984637e-06,
"loss": 0.3701,
"step": 742
},
{
"epoch": 1.6047516198704104,
"grad_norm": 0.14258207380771637,
"learning_rate": 5.263771283003133e-06,
"loss": 0.3724,
"step": 743
},
{
"epoch": 1.6069114470842334,
"grad_norm": 0.14498306810855865,
"learning_rate": 5.251221590898848e-06,
"loss": 0.3716,
"step": 744
},
{
"epoch": 1.6090712742980562,
"grad_norm": 0.14295688271522522,
"learning_rate": 5.238670311942459e-06,
"loss": 0.3877,
"step": 745
},
{
"epoch": 1.611231101511879,
"grad_norm": 0.12707604467868805,
"learning_rate": 5.226117525414663e-06,
"loss": 0.3724,
"step": 746
},
{
"epoch": 1.613390928725702,
"grad_norm": 0.14804835617542267,
"learning_rate": 5.213563310605686e-06,
"loss": 0.3827,
"step": 747
},
{
"epoch": 1.6155507559395248,
"grad_norm": 0.12879379093647003,
"learning_rate": 5.201007746814767e-06,
"loss": 0.3706,
"step": 748
},
{
"epoch": 1.6177105831533476,
"grad_norm": 0.14704205095767975,
"learning_rate": 5.188450913349674e-06,
"loss": 0.3869,
"step": 749
},
{
"epoch": 1.6198704103671706,
"grad_norm": 0.1534765362739563,
"learning_rate": 5.175892889526189e-06,
"loss": 0.3736,
"step": 750
},
{
"epoch": 1.6220302375809936,
"grad_norm": 0.1547222137451172,
"learning_rate": 5.16333375466762e-06,
"loss": 0.3796,
"step": 751
},
{
"epoch": 1.6241900647948164,
"grad_norm": 0.13741885125637054,
"learning_rate": 5.150773588104284e-06,
"loss": 0.3817,
"step": 752
},
{
"epoch": 1.6263498920086392,
"grad_norm": 0.13468679785728455,
"learning_rate": 5.138212469173022e-06,
"loss": 0.3781,
"step": 753
},
{
"epoch": 1.6285097192224622,
"grad_norm": 0.1447237730026245,
"learning_rate": 5.1256504772166885e-06,
"loss": 0.3609,
"step": 754
},
{
"epoch": 1.6306695464362853,
"grad_norm": 0.14089703559875488,
"learning_rate": 5.1130876915836495e-06,
"loss": 0.3609,
"step": 755
},
{
"epoch": 1.6328293736501078,
"grad_norm": 0.13828714191913605,
"learning_rate": 5.100524191627289e-06,
"loss": 0.377,
"step": 756
},
{
"epoch": 1.6349892008639308,
"grad_norm": 0.14428219199180603,
"learning_rate": 5.087960056705499e-06,
"loss": 0.3702,
"step": 757
},
{
"epoch": 1.6371490280777539,
"grad_norm": 0.15890643000602722,
"learning_rate": 5.075395366180186e-06,
"loss": 0.3838,
"step": 758
},
{
"epoch": 1.6393088552915767,
"grad_norm": 0.1514156460762024,
"learning_rate": 5.062830199416764e-06,
"loss": 0.3852,
"step": 759
},
{
"epoch": 1.6414686825053995,
"grad_norm": 0.13208477199077606,
"learning_rate": 5.050264635783654e-06,
"loss": 0.3925,
"step": 760
},
{
"epoch": 1.6436285097192225,
"grad_norm": 0.13601286709308624,
"learning_rate": 5.037698754651786e-06,
"loss": 0.3847,
"step": 761
},
{
"epoch": 1.6457883369330455,
"grad_norm": 0.1418139487504959,
"learning_rate": 5.025132635394095e-06,
"loss": 0.3744,
"step": 762
},
{
"epoch": 1.6479481641468683,
"grad_norm": 0.14959508180618286,
"learning_rate": 5.0125663573850204e-06,
"loss": 0.3712,
"step": 763
},
{
"epoch": 1.650107991360691,
"grad_norm": 0.12993188202381134,
"learning_rate": 5e-06,
"loss": 0.38,
"step": 764
},
{
"epoch": 1.652267818574514,
"grad_norm": 0.14041666686534882,
"learning_rate": 4.987433642614981e-06,
"loss": 0.3751,
"step": 765
},
{
"epoch": 1.654427645788337,
"grad_norm": 0.1548304408788681,
"learning_rate": 4.974867364605906e-06,
"loss": 0.3588,
"step": 766
},
{
"epoch": 1.6565874730021597,
"grad_norm": 0.12369633466005325,
"learning_rate": 4.962301245348215e-06,
"loss": 0.3822,
"step": 767
},
{
"epoch": 1.6587473002159827,
"grad_norm": 0.13229462504386902,
"learning_rate": 4.949735364216348e-06,
"loss": 0.3631,
"step": 768
},
{
"epoch": 1.6609071274298057,
"grad_norm": 0.13191936910152435,
"learning_rate": 4.937169800583237e-06,
"loss": 0.3783,
"step": 769
},
{
"epoch": 1.6630669546436285,
"grad_norm": 0.14189469814300537,
"learning_rate": 4.924604633819815e-06,
"loss": 0.3724,
"step": 770
},
{
"epoch": 1.6652267818574513,
"grad_norm": 0.1306021511554718,
"learning_rate": 4.912039943294502e-06,
"loss": 0.3736,
"step": 771
},
{
"epoch": 1.6673866090712743,
"grad_norm": 0.1423332244157791,
"learning_rate": 4.899475808372714e-06,
"loss": 0.3735,
"step": 772
},
{
"epoch": 1.6695464362850974,
"grad_norm": 0.13784444332122803,
"learning_rate": 4.886912308416353e-06,
"loss": 0.3737,
"step": 773
},
{
"epoch": 1.67170626349892,
"grad_norm": 0.13520213961601257,
"learning_rate": 4.874349522783313e-06,
"loss": 0.3678,
"step": 774
},
{
"epoch": 1.673866090712743,
"grad_norm": 0.15318076312541962,
"learning_rate": 4.861787530826979e-06,
"loss": 0.3716,
"step": 775
},
{
"epoch": 1.676025917926566,
"grad_norm": 0.12309125065803528,
"learning_rate": 4.8492264118957165e-06,
"loss": 0.386,
"step": 776
},
{
"epoch": 1.6781857451403888,
"grad_norm": 0.1470710188150406,
"learning_rate": 4.8366662453323826e-06,
"loss": 0.3848,
"step": 777
},
{
"epoch": 1.6803455723542116,
"grad_norm": 0.12907086312770844,
"learning_rate": 4.8241071104738115e-06,
"loss": 0.3689,
"step": 778
},
{
"epoch": 1.6825053995680346,
"grad_norm": 0.13970084488391876,
"learning_rate": 4.811549086650327e-06,
"loss": 0.3814,
"step": 779
},
{
"epoch": 1.6846652267818576,
"grad_norm": 0.13439306616783142,
"learning_rate": 4.798992253185233e-06,
"loss": 0.3717,
"step": 780
},
{
"epoch": 1.6868250539956804,
"grad_norm": 0.13519345223903656,
"learning_rate": 4.786436689394317e-06,
"loss": 0.3765,
"step": 781
},
{
"epoch": 1.6889848812095032,
"grad_norm": 0.13258984684944153,
"learning_rate": 4.773882474585338e-06,
"loss": 0.3809,
"step": 782
},
{
"epoch": 1.6911447084233262,
"grad_norm": 0.12966322898864746,
"learning_rate": 4.761329688057543e-06,
"loss": 0.3782,
"step": 783
},
{
"epoch": 1.693304535637149,
"grad_norm": 0.13643068075180054,
"learning_rate": 4.748778409101153e-06,
"loss": 0.3796,
"step": 784
},
{
"epoch": 1.6954643628509718,
"grad_norm": 0.1507895290851593,
"learning_rate": 4.736228716996868e-06,
"loss": 0.3789,
"step": 785
},
{
"epoch": 1.6976241900647948,
"grad_norm": 0.14031574130058289,
"learning_rate": 4.723680691015366e-06,
"loss": 0.3816,
"step": 786
},
{
"epoch": 1.6997840172786178,
"grad_norm": 0.13055071234703064,
"learning_rate": 4.711134410416794e-06,
"loss": 0.3643,
"step": 787
},
{
"epoch": 1.7019438444924406,
"grad_norm": 0.15579389035701752,
"learning_rate": 4.6985899544502835e-06,
"loss": 0.3797,
"step": 788
},
{
"epoch": 1.7041036717062634,
"grad_norm": 0.1301419883966446,
"learning_rate": 4.686047402353433e-06,
"loss": 0.3793,
"step": 789
},
{
"epoch": 1.7062634989200864,
"grad_norm": 0.13526466488838196,
"learning_rate": 4.673506833351821e-06,
"loss": 0.3911,
"step": 790
},
{
"epoch": 1.7084233261339092,
"grad_norm": 0.1373325139284134,
"learning_rate": 4.660968326658497e-06,
"loss": 0.3774,
"step": 791
},
{
"epoch": 1.710583153347732,
"grad_norm": 0.1474619358778,
"learning_rate": 4.648431961473482e-06,
"loss": 0.368,
"step": 792
},
{
"epoch": 1.712742980561555,
"grad_norm": 0.14143545925617218,
"learning_rate": 4.635897816983272e-06,
"loss": 0.3779,
"step": 793
},
{
"epoch": 1.714902807775378,
"grad_norm": 0.14204931259155273,
"learning_rate": 4.6233659723603374e-06,
"loss": 0.3667,
"step": 794
},
{
"epoch": 1.7170626349892009,
"grad_norm": 0.13979306817054749,
"learning_rate": 4.610836506762618e-06,
"loss": 0.3782,
"step": 795
},
{
"epoch": 1.7192224622030237,
"grad_norm": 0.14510124921798706,
"learning_rate": 4.59830949933303e-06,
"loss": 0.3705,
"step": 796
},
{
"epoch": 1.7213822894168467,
"grad_norm": 0.14503952860832214,
"learning_rate": 4.5857850291989596e-06,
"loss": 0.3804,
"step": 797
},
{
"epoch": 1.7235421166306697,
"grad_norm": 0.13347502052783966,
"learning_rate": 4.573263175471766e-06,
"loss": 0.3706,
"step": 798
},
{
"epoch": 1.7257019438444925,
"grad_norm": 0.12476824969053268,
"learning_rate": 4.560744017246284e-06,
"loss": 0.3756,
"step": 799
},
{
"epoch": 1.7278617710583153,
"grad_norm": 0.13821221888065338,
"learning_rate": 4.548227633600322e-06,
"loss": 0.3802,
"step": 800
},
{
"epoch": 1.7300215982721383,
"grad_norm": 0.13669945299625397,
"learning_rate": 4.535714103594162e-06,
"loss": 0.3818,
"step": 801
},
{
"epoch": 1.732181425485961,
"grad_norm": 0.1308770775794983,
"learning_rate": 4.523203506270058e-06,
"loss": 0.3836,
"step": 802
},
{
"epoch": 1.734341252699784,
"grad_norm": 0.1351071000099182,
"learning_rate": 4.510695920651742e-06,
"loss": 0.3757,
"step": 803
},
{
"epoch": 1.736501079913607,
"grad_norm": 0.1277286410331726,
"learning_rate": 4.4981914257439254e-06,
"loss": 0.387,
"step": 804
},
{
"epoch": 1.73866090712743,
"grad_norm": 0.1272444725036621,
"learning_rate": 4.485690100531793e-06,
"loss": 0.3829,
"step": 805
},
{
"epoch": 1.7408207343412527,
"grad_norm": 0.14064733684062958,
"learning_rate": 4.473192023980509e-06,
"loss": 0.3822,
"step": 806
},
{
"epoch": 1.7429805615550755,
"grad_norm": 0.13635459542274475,
"learning_rate": 4.460697275034717e-06,
"loss": 0.38,
"step": 807
},
{
"epoch": 1.7451403887688985,
"grad_norm": 0.136144757270813,
"learning_rate": 4.448205932618042e-06,
"loss": 0.3794,
"step": 808
},
{
"epoch": 1.7473002159827213,
"grad_norm": 0.14044472575187683,
"learning_rate": 4.4357180756325915e-06,
"loss": 0.3741,
"step": 809
},
{
"epoch": 1.7494600431965441,
"grad_norm": 0.13555637001991272,
"learning_rate": 4.423233782958459e-06,
"loss": 0.369,
"step": 810
},
{
"epoch": 1.7516198704103672,
"grad_norm": 0.1326342225074768,
"learning_rate": 4.410753133453222e-06,
"loss": 0.3784,
"step": 811
},
{
"epoch": 1.7537796976241902,
"grad_norm": 0.13601535558700562,
"learning_rate": 4.398276205951443e-06,
"loss": 0.3821,
"step": 812
},
{
"epoch": 1.755939524838013,
"grad_norm": 0.13336274027824402,
"learning_rate": 4.38580307926418e-06,
"loss": 0.3713,
"step": 813
},
{
"epoch": 1.7580993520518358,
"grad_norm": 0.14658118784427643,
"learning_rate": 4.373333832178478e-06,
"loss": 0.3825,
"step": 814
},
{
"epoch": 1.7602591792656588,
"grad_norm": 0.14051300287246704,
"learning_rate": 4.360868543456883e-06,
"loss": 0.3685,
"step": 815
},
{
"epoch": 1.7624190064794818,
"grad_norm": 0.1231897696852684,
"learning_rate": 4.348407291836928e-06,
"loss": 0.37,
"step": 816
},
{
"epoch": 1.7645788336933044,
"grad_norm": 0.13528205454349518,
"learning_rate": 4.335950156030653e-06,
"loss": 0.3855,
"step": 817
},
{
"epoch": 1.7667386609071274,
"grad_norm": 0.14070774614810944,
"learning_rate": 4.323497214724099e-06,
"loss": 0.3752,
"step": 818
},
{
"epoch": 1.7688984881209504,
"grad_norm": 0.1284414529800415,
"learning_rate": 4.31104854657681e-06,
"loss": 0.3659,
"step": 819
},
{
"epoch": 1.7710583153347732,
"grad_norm": 0.13247400522232056,
"learning_rate": 4.298604230221341e-06,
"loss": 0.3727,
"step": 820
},
{
"epoch": 1.773218142548596,
"grad_norm": 0.12880460917949677,
"learning_rate": 4.286164344262756e-06,
"loss": 0.3867,
"step": 821
},
{
"epoch": 1.775377969762419,
"grad_norm": 0.12950289249420166,
"learning_rate": 4.273728967278137e-06,
"loss": 0.3685,
"step": 822
},
{
"epoch": 1.777537796976242,
"grad_norm": 0.1209382489323616,
"learning_rate": 4.261298177816082e-06,
"loss": 0.3658,
"step": 823
},
{
"epoch": 1.7796976241900648,
"grad_norm": 0.1271076798439026,
"learning_rate": 4.248872054396215e-06,
"loss": 0.3801,
"step": 824
},
{
"epoch": 1.7818574514038876,
"grad_norm": 0.1265021562576294,
"learning_rate": 4.2364506755086856e-06,
"loss": 0.3719,
"step": 825
},
{
"epoch": 1.7840172786177106,
"grad_norm": 0.12241175025701523,
"learning_rate": 4.224034119613671e-06,
"loss": 0.3744,
"step": 826
},
{
"epoch": 1.7861771058315334,
"grad_norm": 0.12162206321954727,
"learning_rate": 4.211622465140887e-06,
"loss": 0.3797,
"step": 827
},
{
"epoch": 1.7883369330453562,
"grad_norm": 0.12623707950115204,
"learning_rate": 4.199215790489091e-06,
"loss": 0.3859,
"step": 828
},
{
"epoch": 1.7904967602591793,
"grad_norm": 0.13991111516952515,
"learning_rate": 4.186814174025582e-06,
"loss": 0.3736,
"step": 829
},
{
"epoch": 1.7926565874730023,
"grad_norm": 0.12621738016605377,
"learning_rate": 4.174417694085711e-06,
"loss": 0.3743,
"step": 830
},
{
"epoch": 1.794816414686825,
"grad_norm": 0.12811291217803955,
"learning_rate": 4.16202642897238e-06,
"loss": 0.3782,
"step": 831
},
{
"epoch": 1.7969762419006479,
"grad_norm": 0.12236473709344864,
"learning_rate": 4.149640456955555e-06,
"loss": 0.3764,
"step": 832
},
{
"epoch": 1.7991360691144709,
"grad_norm": 0.142435684800148,
"learning_rate": 4.137259856271767e-06,
"loss": 0.3719,
"step": 833
},
{
"epoch": 1.801295896328294,
"grad_norm": 0.12946586310863495,
"learning_rate": 4.124884705123619e-06,
"loss": 0.3852,
"step": 834
},
{
"epoch": 1.8034557235421165,
"grad_norm": 0.1189626008272171,
"learning_rate": 4.112515081679295e-06,
"loss": 0.3751,
"step": 835
},
{
"epoch": 1.8056155507559395,
"grad_norm": 0.13230590522289276,
"learning_rate": 4.1001510640720525e-06,
"loss": 0.3688,
"step": 836
},
{
"epoch": 1.8077753779697625,
"grad_norm": 0.13355307281017303,
"learning_rate": 4.087792730399749e-06,
"loss": 0.3885,
"step": 837
},
{
"epoch": 1.8099352051835853,
"grad_norm": 0.13115477561950684,
"learning_rate": 4.075440158724339e-06,
"loss": 0.3807,
"step": 838
},
{
"epoch": 1.812095032397408,
"grad_norm": 0.11709022521972656,
"learning_rate": 4.063093427071376e-06,
"loss": 0.3715,
"step": 839
},
{
"epoch": 1.8142548596112311,
"grad_norm": 0.13939060270786285,
"learning_rate": 4.0507526134295314e-06,
"loss": 0.3718,
"step": 840
},
{
"epoch": 1.8164146868250541,
"grad_norm": 0.13002587854862213,
"learning_rate": 4.038417795750086e-06,
"loss": 0.378,
"step": 841
},
{
"epoch": 1.818574514038877,
"grad_norm": 0.13568507134914398,
"learning_rate": 4.0260890519464565e-06,
"loss": 0.3715,
"step": 842
},
{
"epoch": 1.8207343412526997,
"grad_norm": 0.13115161657333374,
"learning_rate": 4.013766459893686e-06,
"loss": 0.374,
"step": 843
},
{
"epoch": 1.8228941684665227,
"grad_norm": 0.1360725313425064,
"learning_rate": 4.001450097427965e-06,
"loss": 0.3915,
"step": 844
},
{
"epoch": 1.8250539956803455,
"grad_norm": 0.14375773072242737,
"learning_rate": 3.989140042346134e-06,
"loss": 0.3823,
"step": 845
},
{
"epoch": 1.8272138228941683,
"grad_norm": 0.14056192338466644,
"learning_rate": 3.9768363724051875e-06,
"loss": 0.3797,
"step": 846
},
{
"epoch": 1.8293736501079914,
"grad_norm": 0.13145223259925842,
"learning_rate": 3.964539165321795e-06,
"loss": 0.3651,
"step": 847
},
{
"epoch": 1.8315334773218144,
"grad_norm": 0.1401747614145279,
"learning_rate": 3.952248498771797e-06,
"loss": 0.3803,
"step": 848
},
{
"epoch": 1.8336933045356372,
"grad_norm": 0.1457161009311676,
"learning_rate": 3.939964450389728e-06,
"loss": 0.3875,
"step": 849
},
{
"epoch": 1.83585313174946,
"grad_norm": 0.1399625837802887,
"learning_rate": 3.927687097768309e-06,
"loss": 0.3855,
"step": 850
},
{
"epoch": 1.838012958963283,
"grad_norm": 0.12442053109407425,
"learning_rate": 3.915416518457974e-06,
"loss": 0.3885,
"step": 851
},
{
"epoch": 1.8401727861771058,
"grad_norm": 0.12682035565376282,
"learning_rate": 3.9031527899663705e-06,
"loss": 0.3708,
"step": 852
},
{
"epoch": 1.8423326133909286,
"grad_norm": 0.12829378247261047,
"learning_rate": 3.890895989757874e-06,
"loss": 0.376,
"step": 853
},
{
"epoch": 1.8444924406047516,
"grad_norm": 0.14053881168365479,
"learning_rate": 3.8786461952530955e-06,
"loss": 0.373,
"step": 854
},
{
"epoch": 1.8466522678185746,
"grad_norm": 0.1281130015850067,
"learning_rate": 3.866403483828392e-06,
"loss": 0.3773,
"step": 855
},
{
"epoch": 1.8488120950323974,
"grad_norm": 0.12932966649532318,
"learning_rate": 3.854167932815387e-06,
"loss": 0.383,
"step": 856
},
{
"epoch": 1.8509719222462202,
"grad_norm": 0.1419646143913269,
"learning_rate": 3.841939619500468e-06,
"loss": 0.3674,
"step": 857
},
{
"epoch": 1.8531317494600432,
"grad_norm": 0.12675082683563232,
"learning_rate": 3.8297186211243085e-06,
"loss": 0.3814,
"step": 858
},
{
"epoch": 1.8552915766738662,
"grad_norm": 0.11979357898235321,
"learning_rate": 3.817505014881378e-06,
"loss": 0.38,
"step": 859
},
{
"epoch": 1.857451403887689,
"grad_norm": 0.13714280724525452,
"learning_rate": 3.8052988779194478e-06,
"loss": 0.3823,
"step": 860
},
{
"epoch": 1.8596112311015118,
"grad_norm": 0.13954536616802216,
"learning_rate": 3.7931002873391156e-06,
"loss": 0.3796,
"step": 861
},
{
"epoch": 1.8617710583153348,
"grad_norm": 0.11842264980077744,
"learning_rate": 3.7809093201933078e-06,
"loss": 0.3761,
"step": 862
},
{
"epoch": 1.8639308855291576,
"grad_norm": 0.12878850102424622,
"learning_rate": 3.7687260534868e-06,
"loss": 0.3821,
"step": 863
},
{
"epoch": 1.8660907127429804,
"grad_norm": 0.14155155420303345,
"learning_rate": 3.756550564175727e-06,
"loss": 0.3748,
"step": 864
},
{
"epoch": 1.8682505399568035,
"grad_norm": 0.12996132671833038,
"learning_rate": 3.744382929167094e-06,
"loss": 0.3741,
"step": 865
},
{
"epoch": 1.8704103671706265,
"grad_norm": 0.12898430228233337,
"learning_rate": 3.7322232253182984e-06,
"loss": 0.3763,
"step": 866
},
{
"epoch": 1.8725701943844493,
"grad_norm": 0.12044209241867065,
"learning_rate": 3.7200715294366376e-06,
"loss": 0.3747,
"step": 867
},
{
"epoch": 1.874730021598272,
"grad_norm": 0.12121162563562393,
"learning_rate": 3.7079279182788263e-06,
"loss": 0.381,
"step": 868
},
{
"epoch": 1.876889848812095,
"grad_norm": 0.12106562405824661,
"learning_rate": 3.695792468550517e-06,
"loss": 0.3767,
"step": 869
},
{
"epoch": 1.8790496760259179,
"grad_norm": 0.118615061044693,
"learning_rate": 3.6836652569057994e-06,
"loss": 0.3708,
"step": 870
},
{
"epoch": 1.8812095032397407,
"grad_norm": 0.12058837711811066,
"learning_rate": 3.6715463599467372e-06,
"loss": 0.3778,
"step": 871
},
{
"epoch": 1.8833693304535637,
"grad_norm": 0.12485583126544952,
"learning_rate": 3.659435854222869e-06,
"loss": 0.3679,
"step": 872
},
{
"epoch": 1.8855291576673867,
"grad_norm": 0.11957580596208572,
"learning_rate": 3.6473338162307314e-06,
"loss": 0.3709,
"step": 873
},
{
"epoch": 1.8876889848812095,
"grad_norm": 0.12649306654930115,
"learning_rate": 3.635240322413375e-06,
"loss": 0.3803,
"step": 874
},
{
"epoch": 1.8898488120950323,
"grad_norm": 0.12188448011875153,
"learning_rate": 3.6231554491598766e-06,
"loss": 0.3753,
"step": 875
},
{
"epoch": 1.8920086393088553,
"grad_norm": 0.12264645844697952,
"learning_rate": 3.6110792728048636e-06,
"loss": 0.3736,
"step": 876
},
{
"epoch": 1.8941684665226783,
"grad_norm": 0.12633198499679565,
"learning_rate": 3.599011869628033e-06,
"loss": 0.3734,
"step": 877
},
{
"epoch": 1.896328293736501,
"grad_norm": 0.12245716154575348,
"learning_rate": 3.5869533158536583e-06,
"loss": 0.3661,
"step": 878
},
{
"epoch": 1.898488120950324,
"grad_norm": 0.11652278900146484,
"learning_rate": 3.5749036876501196e-06,
"loss": 0.3775,
"step": 879
},
{
"epoch": 1.900647948164147,
"grad_norm": 0.1323150098323822,
"learning_rate": 3.562863061129419e-06,
"loss": 0.3736,
"step": 880
},
{
"epoch": 1.9028077753779697,
"grad_norm": 0.1193445473909378,
"learning_rate": 3.550831512346695e-06,
"loss": 0.3756,
"step": 881
},
{
"epoch": 1.9049676025917925,
"grad_norm": 0.11626636981964111,
"learning_rate": 3.538809117299751e-06,
"loss": 0.3771,
"step": 882
},
{
"epoch": 1.9071274298056156,
"grad_norm": 0.13352340459823608,
"learning_rate": 3.526795951928569e-06,
"loss": 0.3828,
"step": 883
},
{
"epoch": 1.9092872570194386,
"grad_norm": 0.13351115584373474,
"learning_rate": 3.5147920921148267e-06,
"loss": 0.3645,
"step": 884
},
{
"epoch": 1.9114470842332614,
"grad_norm": 0.11943700909614563,
"learning_rate": 3.502797613681429e-06,
"loss": 0.386,
"step": 885
},
{
"epoch": 1.9136069114470842,
"grad_norm": 0.1416894644498825,
"learning_rate": 3.4908125923920204e-06,
"loss": 0.3771,
"step": 886
},
{
"epoch": 1.9157667386609072,
"grad_norm": 0.12883397936820984,
"learning_rate": 3.478837103950509e-06,
"loss": 0.38,
"step": 887
},
{
"epoch": 1.91792656587473,
"grad_norm": 0.12375036627054214,
"learning_rate": 3.4668712240005912e-06,
"loss": 0.3771,
"step": 888
},
{
"epoch": 1.9200863930885528,
"grad_norm": 0.13057532906532288,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.3867,
"step": 889
},
{
"epoch": 1.9222462203023758,
"grad_norm": 0.13063670694828033,
"learning_rate": 3.442968591846359e-06,
"loss": 0.3746,
"step": 890
},
{
"epoch": 1.9244060475161988,
"grad_norm": 0.11987043917179108,
"learning_rate": 3.431031990624063e-06,
"loss": 0.3733,
"step": 891
},
{
"epoch": 1.9265658747300216,
"grad_norm": 0.13609716296195984,
"learning_rate": 3.4191052998564344e-06,
"loss": 0.3766,
"step": 892
},
{
"epoch": 1.9287257019438444,
"grad_norm": 0.1286943554878235,
"learning_rate": 3.407188594878938e-06,
"loss": 0.3777,
"step": 893
},
{
"epoch": 1.9308855291576674,
"grad_norm": 0.15128813683986664,
"learning_rate": 3.3952819509639534e-06,
"loss": 0.3729,
"step": 894
},
{
"epoch": 1.9330453563714904,
"grad_norm": 0.12446796149015427,
"learning_rate": 3.3833854433203185e-06,
"loss": 0.3773,
"step": 895
},
{
"epoch": 1.935205183585313,
"grad_norm": 0.1316557675600052,
"learning_rate": 3.3714991470928393e-06,
"loss": 0.3843,
"step": 896
},
{
"epoch": 1.937365010799136,
"grad_norm": 0.12782582640647888,
"learning_rate": 3.359623137361825e-06,
"loss": 0.3787,
"step": 897
},
{
"epoch": 1.939524838012959,
"grad_norm": 0.13275963068008423,
"learning_rate": 3.347757489142608e-06,
"loss": 0.3809,
"step": 898
},
{
"epoch": 1.9416846652267818,
"grad_norm": 0.1355566680431366,
"learning_rate": 3.3359022773850673e-06,
"loss": 0.3798,
"step": 899
},
{
"epoch": 1.9438444924406046,
"grad_norm": 0.12694980204105377,
"learning_rate": 3.3240575769731662e-06,
"loss": 0.3825,
"step": 900
},
{
"epoch": 1.9460043196544277,
"grad_norm": 0.13038381934165955,
"learning_rate": 3.312223462724472e-06,
"loss": 0.3861,
"step": 901
},
{
"epoch": 1.9481641468682507,
"grad_norm": 0.1373014897108078,
"learning_rate": 3.300400009389678e-06,
"loss": 0.3828,
"step": 902
},
{
"epoch": 1.9503239740820735,
"grad_norm": 0.1347927749156952,
"learning_rate": 3.2885872916521445e-06,
"loss": 0.3701,
"step": 903
},
{
"epoch": 1.9524838012958963,
"grad_norm": 0.12417130172252655,
"learning_rate": 3.2767853841274154e-06,
"loss": 0.3823,
"step": 904
},
{
"epoch": 1.9546436285097193,
"grad_norm": 0.1381063610315323,
"learning_rate": 3.264994361362753e-06,
"loss": 0.3768,
"step": 905
},
{
"epoch": 1.956803455723542,
"grad_norm": 0.12305869907140732,
"learning_rate": 3.2532142978366654e-06,
"loss": 0.3803,
"step": 906
},
{
"epoch": 1.9589632829373649,
"grad_norm": 0.12444626539945602,
"learning_rate": 3.241445267958438e-06,
"loss": 0.3717,
"step": 907
},
{
"epoch": 1.961123110151188,
"grad_norm": 0.12498262524604797,
"learning_rate": 3.2296873460676557e-06,
"loss": 0.3739,
"step": 908
},
{
"epoch": 1.963282937365011,
"grad_norm": 0.11730080097913742,
"learning_rate": 3.217940606433747e-06,
"loss": 0.379,
"step": 909
},
{
"epoch": 1.9654427645788337,
"grad_norm": 0.13885721564292908,
"learning_rate": 3.2062051232555024e-06,
"loss": 0.3693,
"step": 910
},
{
"epoch": 1.9676025917926565,
"grad_norm": 0.12217065691947937,
"learning_rate": 3.1944809706606123e-06,
"loss": 0.3739,
"step": 911
},
{
"epoch": 1.9697624190064795,
"grad_norm": 0.1336861401796341,
"learning_rate": 3.182768222705198e-06,
"loss": 0.3747,
"step": 912
},
{
"epoch": 1.9719222462203023,
"grad_norm": 0.12711098790168762,
"learning_rate": 3.171066953373338e-06,
"loss": 0.3821,
"step": 913
},
{
"epoch": 1.9740820734341251,
"grad_norm": 0.1329392045736313,
"learning_rate": 3.1593772365766107e-06,
"loss": 0.376,
"step": 914
},
{
"epoch": 1.9762419006479481,
"grad_norm": 0.12040119618177414,
"learning_rate": 3.147699146153621e-06,
"loss": 0.3738,
"step": 915
},
{
"epoch": 1.9784017278617712,
"grad_norm": 0.23164218664169312,
"learning_rate": 3.1360327558695336e-06,
"loss": 0.3802,
"step": 916
},
{
"epoch": 1.980561555075594,
"grad_norm": 0.11707053333520889,
"learning_rate": 3.1243781394156138e-06,
"loss": 0.3813,
"step": 917
},
{
"epoch": 1.9827213822894167,
"grad_norm": 0.1273183971643448,
"learning_rate": 3.1127353704087477e-06,
"loss": 0.3779,
"step": 918
},
{
"epoch": 1.9848812095032398,
"grad_norm": 0.1234814003109932,
"learning_rate": 3.1011045223909954e-06,
"loss": 0.3804,
"step": 919
},
{
"epoch": 1.9870410367170628,
"grad_norm": 0.1258484572172165,
"learning_rate": 3.089485668829113e-06,
"loss": 0.3811,
"step": 920
},
{
"epoch": 1.9892008639308856,
"grad_norm": 0.1231926903128624,
"learning_rate": 3.077878883114096e-06,
"loss": 0.3831,
"step": 921
},
{
"epoch": 1.9913606911447084,
"grad_norm": 0.12332677841186523,
"learning_rate": 3.066284238560713e-06,
"loss": 0.3698,
"step": 922
},
{
"epoch": 1.9935205183585314,
"grad_norm": 0.12334899604320526,
"learning_rate": 3.0547018084070344e-06,
"loss": 0.3768,
"step": 923
},
{
"epoch": 1.9956803455723542,
"grad_norm": 0.11958076804876328,
"learning_rate": 3.043131665813988e-06,
"loss": 0.3684,
"step": 924
},
{
"epoch": 1.997840172786177,
"grad_norm": 0.12707985937595367,
"learning_rate": 3.031573883864882e-06,
"loss": 0.382,
"step": 925
},
{
"epoch": 2.0,
"grad_norm": 0.14072422683238983,
"learning_rate": 3.0200285355649504e-06,
"loss": 0.3729,
"step": 926
},
{
"epoch": 2.002159827213823,
"grad_norm": 0.1437283754348755,
"learning_rate": 3.0084956938408873e-06,
"loss": 0.3623,
"step": 927
},
{
"epoch": 2.0043196544276456,
"grad_norm": 0.12962134182453156,
"learning_rate": 2.9969754315403865e-06,
"loss": 0.3649,
"step": 928
},
{
"epoch": 2.0064794816414686,
"grad_norm": 0.13019190728664398,
"learning_rate": 2.9854678214316875e-06,
"loss": 0.3626,
"step": 929
},
{
"epoch": 2.0086393088552916,
"grad_norm": 0.12730036675930023,
"learning_rate": 2.97397293620311e-06,
"loss": 0.3572,
"step": 930
},
{
"epoch": 2.0107991360691146,
"grad_norm": 0.13056515157222748,
"learning_rate": 2.962490848462596e-06,
"loss": 0.3474,
"step": 931
},
{
"epoch": 2.012958963282937,
"grad_norm": 0.13034315407276154,
"learning_rate": 2.951021630737255e-06,
"loss": 0.3679,
"step": 932
},
{
"epoch": 2.0151187904967602,
"grad_norm": 0.14478375017642975,
"learning_rate": 2.9395653554728955e-06,
"loss": 0.3579,
"step": 933
},
{
"epoch": 2.0172786177105833,
"grad_norm": 0.1417471021413803,
"learning_rate": 2.92812209503358e-06,
"loss": 0.365,
"step": 934
},
{
"epoch": 2.019438444924406,
"grad_norm": 0.13237528502941132,
"learning_rate": 2.91669192170116e-06,
"loss": 0.3658,
"step": 935
},
{
"epoch": 2.021598272138229,
"grad_norm": 0.1314917653799057,
"learning_rate": 2.9052749076748266e-06,
"loss": 0.3687,
"step": 936
},
{
"epoch": 2.023758099352052,
"grad_norm": 0.11867891997098923,
"learning_rate": 2.8938711250706397e-06,
"loss": 0.3643,
"step": 937
},
{
"epoch": 2.025917926565875,
"grad_norm": 0.13976238667964935,
"learning_rate": 2.8824806459210907e-06,
"loss": 0.3678,
"step": 938
},
{
"epoch": 2.0280777537796975,
"grad_norm": 0.1445903778076172,
"learning_rate": 2.871103542174637e-06,
"loss": 0.3574,
"step": 939
},
{
"epoch": 2.0302375809935205,
"grad_norm": 0.11427946388721466,
"learning_rate": 2.8597398856952473e-06,
"loss": 0.3569,
"step": 940
},
{
"epoch": 2.0323974082073435,
"grad_norm": 0.13496039807796478,
"learning_rate": 2.8483897482619566e-06,
"loss": 0.3717,
"step": 941
},
{
"epoch": 2.0345572354211665,
"grad_norm": 0.13540537655353546,
"learning_rate": 2.837053201568396e-06,
"loss": 0.3667,
"step": 942
},
{
"epoch": 2.036717062634989,
"grad_norm": 0.12281273305416107,
"learning_rate": 2.825730317222358e-06,
"loss": 0.3541,
"step": 943
},
{
"epoch": 2.038876889848812,
"grad_norm": 0.12640658020973206,
"learning_rate": 2.814421166745337e-06,
"loss": 0.3641,
"step": 944
},
{
"epoch": 2.041036717062635,
"grad_norm": 0.12110286951065063,
"learning_rate": 2.803125821572068e-06,
"loss": 0.3597,
"step": 945
},
{
"epoch": 2.0431965442764577,
"grad_norm": 0.13443802297115326,
"learning_rate": 2.791844353050094e-06,
"loss": 0.3709,
"step": 946
},
{
"epoch": 2.0453563714902807,
"grad_norm": 0.11386435478925705,
"learning_rate": 2.7805768324393017e-06,
"loss": 0.3681,
"step": 947
},
{
"epoch": 2.0475161987041037,
"grad_norm": 0.13114500045776367,
"learning_rate": 2.769323330911472e-06,
"loss": 0.3602,
"step": 948
},
{
"epoch": 2.0496760259179267,
"grad_norm": 0.13121016323566437,
"learning_rate": 2.7580839195498397e-06,
"loss": 0.3567,
"step": 949
},
{
"epoch": 2.0518358531317493,
"grad_norm": 0.11939337104558945,
"learning_rate": 2.746858669348634e-06,
"loss": 0.3611,
"step": 950
},
{
"epoch": 2.0539956803455723,
"grad_norm": 0.11663561314344406,
"learning_rate": 2.7356476512126386e-06,
"loss": 0.3557,
"step": 951
},
{
"epoch": 2.0561555075593954,
"grad_norm": 0.11576730012893677,
"learning_rate": 2.724450935956733e-06,
"loss": 0.3723,
"step": 952
},
{
"epoch": 2.058315334773218,
"grad_norm": 0.1251356601715088,
"learning_rate": 2.713268594305458e-06,
"loss": 0.3637,
"step": 953
},
{
"epoch": 2.060475161987041,
"grad_norm": 0.12093979120254517,
"learning_rate": 2.7021006968925613e-06,
"loss": 0.3752,
"step": 954
},
{
"epoch": 2.062634989200864,
"grad_norm": 0.12214156985282898,
"learning_rate": 2.6909473142605522e-06,
"loss": 0.3638,
"step": 955
},
{
"epoch": 2.064794816414687,
"grad_norm": 0.12628315389156342,
"learning_rate": 2.6798085168602595e-06,
"loss": 0.3667,
"step": 956
},
{
"epoch": 2.0669546436285096,
"grad_norm": 0.12237667292356491,
"learning_rate": 2.668684375050378e-06,
"loss": 0.3653,
"step": 957
},
{
"epoch": 2.0691144708423326,
"grad_norm": 0.1107870489358902,
"learning_rate": 2.6575749590970336e-06,
"loss": 0.3558,
"step": 958
},
{
"epoch": 2.0712742980561556,
"grad_norm": 0.1208115965127945,
"learning_rate": 2.646480339173337e-06,
"loss": 0.3733,
"step": 959
},
{
"epoch": 2.0734341252699786,
"grad_norm": 0.12692323327064514,
"learning_rate": 2.635400585358937e-06,
"loss": 0.3663,
"step": 960
},
{
"epoch": 2.075593952483801,
"grad_norm": 0.11760963499546051,
"learning_rate": 2.624335767639582e-06,
"loss": 0.3638,
"step": 961
},
{
"epoch": 2.077753779697624,
"grad_norm": 0.12751303613185883,
"learning_rate": 2.6132859559066704e-06,
"loss": 0.3547,
"step": 962
},
{
"epoch": 2.079913606911447,
"grad_norm": 0.12997639179229736,
"learning_rate": 2.6022512199568205e-06,
"loss": 0.3558,
"step": 963
},
{
"epoch": 2.08207343412527,
"grad_norm": 0.12291760742664337,
"learning_rate": 2.5912316294914232e-06,
"loss": 0.3506,
"step": 964
},
{
"epoch": 2.084233261339093,
"grad_norm": 0.11633151024580002,
"learning_rate": 2.580227254116199e-06,
"loss": 0.3648,
"step": 965
},
{
"epoch": 2.086393088552916,
"grad_norm": 0.12379375100135803,
"learning_rate": 2.5692381633407672e-06,
"loss": 0.3652,
"step": 966
},
{
"epoch": 2.088552915766739,
"grad_norm": 0.12270376831293106,
"learning_rate": 2.558264426578192e-06,
"loss": 0.3625,
"step": 967
},
{
"epoch": 2.0907127429805614,
"grad_norm": 0.12057667225599289,
"learning_rate": 2.547306113144564e-06,
"loss": 0.3712,
"step": 968
},
{
"epoch": 2.0928725701943844,
"grad_norm": 0.1182745024561882,
"learning_rate": 2.536363292258543e-06,
"loss": 0.3686,
"step": 969
},
{
"epoch": 2.0950323974082075,
"grad_norm": 0.12089554965496063,
"learning_rate": 2.5254360330409343e-06,
"loss": 0.3603,
"step": 970
},
{
"epoch": 2.09719222462203,
"grad_norm": 0.12302310764789581,
"learning_rate": 2.514524404514248e-06,
"loss": 0.3599,
"step": 971
},
{
"epoch": 2.099352051835853,
"grad_norm": 0.1283075213432312,
"learning_rate": 2.503628475602256e-06,
"loss": 0.3685,
"step": 972
},
{
"epoch": 2.101511879049676,
"grad_norm": 0.11500417441129684,
"learning_rate": 2.49274831512957e-06,
"loss": 0.3657,
"step": 973
},
{
"epoch": 2.103671706263499,
"grad_norm": 0.11335953325033188,
"learning_rate": 2.4818839918211963e-06,
"loss": 0.3689,
"step": 974
},
{
"epoch": 2.1058315334773217,
"grad_norm": 0.12606894969940186,
"learning_rate": 2.4710355743021077e-06,
"loss": 0.359,
"step": 975
},
{
"epoch": 2.1079913606911447,
"grad_norm": 0.11400944739580154,
"learning_rate": 2.4602031310968013e-06,
"loss": 0.3661,
"step": 976
},
{
"epoch": 2.1101511879049677,
"grad_norm": 0.11969246715307236,
"learning_rate": 2.4493867306288772e-06,
"loss": 0.3618,
"step": 977
},
{
"epoch": 2.1123110151187907,
"grad_norm": 0.11956711113452911,
"learning_rate": 2.4385864412206e-06,
"loss": 0.3516,
"step": 978
},
{
"epoch": 2.1144708423326133,
"grad_norm": 0.11470730602741241,
"learning_rate": 2.4278023310924676e-06,
"loss": 0.3651,
"step": 979
},
{
"epoch": 2.1166306695464363,
"grad_norm": 0.12043334543704987,
"learning_rate": 2.417034468362782e-06,
"loss": 0.3702,
"step": 980
},
{
"epoch": 2.1187904967602593,
"grad_norm": 0.11915960907936096,
"learning_rate": 2.406282921047213e-06,
"loss": 0.3609,
"step": 981
},
{
"epoch": 2.120950323974082,
"grad_norm": 0.1116413027048111,
"learning_rate": 2.395547757058379e-06,
"loss": 0.3576,
"step": 982
},
{
"epoch": 2.123110151187905,
"grad_norm": 0.11029747128486633,
"learning_rate": 2.3848290442054096e-06,
"loss": 0.3618,
"step": 983
},
{
"epoch": 2.125269978401728,
"grad_norm": 0.12164044380187988,
"learning_rate": 2.3741268501935212e-06,
"loss": 0.3557,
"step": 984
},
{
"epoch": 2.127429805615551,
"grad_norm": 0.11805900186300278,
"learning_rate": 2.3634412426235886e-06,
"loss": 0.3665,
"step": 985
},
{
"epoch": 2.1295896328293735,
"grad_norm": 0.12578925490379333,
"learning_rate": 2.3527722889917147e-06,
"loss": 0.3617,
"step": 986
},
{
"epoch": 2.1317494600431965,
"grad_norm": 0.11140415817499161,
"learning_rate": 2.3421200566888096e-06,
"loss": 0.3529,
"step": 987
},
{
"epoch": 2.1339092872570196,
"grad_norm": 0.12330644577741623,
"learning_rate": 2.3314846130001622e-06,
"loss": 0.3512,
"step": 988
},
{
"epoch": 2.136069114470842,
"grad_norm": 0.11442252993583679,
"learning_rate": 2.320866025105016e-06,
"loss": 0.3527,
"step": 989
},
{
"epoch": 2.138228941684665,
"grad_norm": 0.11933194845914841,
"learning_rate": 2.3102643600761445e-06,
"loss": 0.3481,
"step": 990
},
{
"epoch": 2.140388768898488,
"grad_norm": 0.11264543980360031,
"learning_rate": 2.299679684879421e-06,
"loss": 0.3583,
"step": 991
},
{
"epoch": 2.142548596112311,
"grad_norm": 0.11280512809753418,
"learning_rate": 2.289112066373411e-06,
"loss": 0.3629,
"step": 992
},
{
"epoch": 2.1447084233261338,
"grad_norm": 0.11324049532413483,
"learning_rate": 2.2785615713089363e-06,
"loss": 0.3609,
"step": 993
},
{
"epoch": 2.146868250539957,
"grad_norm": 0.10841232538223267,
"learning_rate": 2.268028266328655e-06,
"loss": 0.3613,
"step": 994
},
{
"epoch": 2.14902807775378,
"grad_norm": 0.1152244582772255,
"learning_rate": 2.25751221796665e-06,
"loss": 0.3571,
"step": 995
},
{
"epoch": 2.1511879049676024,
"grad_norm": 0.11110089719295502,
"learning_rate": 2.247013492647994e-06,
"loss": 0.3548,
"step": 996
},
{
"epoch": 2.1533477321814254,
"grad_norm": 0.11328666657209396,
"learning_rate": 2.2365321566883437e-06,
"loss": 0.3586,
"step": 997
},
{
"epoch": 2.1555075593952484,
"grad_norm": 0.11004538089036942,
"learning_rate": 2.2260682762935137e-06,
"loss": 0.3565,
"step": 998
},
{
"epoch": 2.1576673866090714,
"grad_norm": 0.11562500894069672,
"learning_rate": 2.2156219175590623e-06,
"loss": 0.3619,
"step": 999
},
{
"epoch": 2.159827213822894,
"grad_norm": 0.11296035349369049,
"learning_rate": 2.2051931464698636e-06,
"loss": 0.3656,
"step": 1000
},
{
"epoch": 2.161987041036717,
"grad_norm": 0.11270337551832199,
"learning_rate": 2.1947820288997067e-06,
"loss": 0.3496,
"step": 1001
},
{
"epoch": 2.16414686825054,
"grad_norm": 0.11527759581804276,
"learning_rate": 2.1843886306108686e-06,
"loss": 0.3695,
"step": 1002
},
{
"epoch": 2.166306695464363,
"grad_norm": 0.11408324539661407,
"learning_rate": 2.174013017253701e-06,
"loss": 0.3651,
"step": 1003
},
{
"epoch": 2.1684665226781856,
"grad_norm": 0.10843408107757568,
"learning_rate": 2.1636552543662187e-06,
"loss": 0.3692,
"step": 1004
},
{
"epoch": 2.1706263498920086,
"grad_norm": 0.11223003268241882,
"learning_rate": 2.153315407373679e-06,
"loss": 0.3545,
"step": 1005
},
{
"epoch": 2.1727861771058317,
"grad_norm": 0.11480898410081863,
"learning_rate": 2.1429935415881753e-06,
"loss": 0.3609,
"step": 1006
},
{
"epoch": 2.1749460043196542,
"grad_norm": 0.1133100613951683,
"learning_rate": 2.132689722208223e-06,
"loss": 0.361,
"step": 1007
},
{
"epoch": 2.1771058315334773,
"grad_norm": 0.11355537176132202,
"learning_rate": 2.1224040143183444e-06,
"loss": 0.3681,
"step": 1008
},
{
"epoch": 2.1792656587473003,
"grad_norm": 0.11831656098365784,
"learning_rate": 2.112136482888663e-06,
"loss": 0.3555,
"step": 1009
},
{
"epoch": 2.1814254859611233,
"grad_norm": 0.11772197484970093,
"learning_rate": 2.1018871927744844e-06,
"loss": 0.3604,
"step": 1010
},
{
"epoch": 2.183585313174946,
"grad_norm": 0.10822444409132004,
"learning_rate": 2.0916562087158964e-06,
"loss": 0.3583,
"step": 1011
},
{
"epoch": 2.185745140388769,
"grad_norm": 0.21270522475242615,
"learning_rate": 2.0814435953373554e-06,
"loss": 0.3651,
"step": 1012
},
{
"epoch": 2.187904967602592,
"grad_norm": 0.11271930485963821,
"learning_rate": 2.0712494171472776e-06,
"loss": 0.367,
"step": 1013
},
{
"epoch": 2.190064794816415,
"grad_norm": 0.1191214919090271,
"learning_rate": 2.061073738537635e-06,
"loss": 0.3566,
"step": 1014
},
{
"epoch": 2.1922246220302375,
"grad_norm": 0.1228100061416626,
"learning_rate": 2.0509166237835398e-06,
"loss": 0.3553,
"step": 1015
},
{
"epoch": 2.1943844492440605,
"grad_norm": 0.11371961981058121,
"learning_rate": 2.040778137042852e-06,
"loss": 0.3621,
"step": 1016
},
{
"epoch": 2.1965442764578835,
"grad_norm": 0.10948773473501205,
"learning_rate": 2.030658342355765e-06,
"loss": 0.3612,
"step": 1017
},
{
"epoch": 2.198704103671706,
"grad_norm": 0.10944036394357681,
"learning_rate": 2.0205573036443994e-06,
"loss": 0.3619,
"step": 1018
},
{
"epoch": 2.200863930885529,
"grad_norm": 0.11753126233816147,
"learning_rate": 2.0104750847124075e-06,
"loss": 0.3636,
"step": 1019
},
{
"epoch": 2.203023758099352,
"grad_norm": 0.12510347366333008,
"learning_rate": 2.0004117492445614e-06,
"loss": 0.3789,
"step": 1020
},
{
"epoch": 2.205183585313175,
"grad_norm": 0.1162487342953682,
"learning_rate": 1.990367360806359e-06,
"loss": 0.3595,
"step": 1021
},
{
"epoch": 2.2073434125269977,
"grad_norm": 0.12260331958532333,
"learning_rate": 1.980341982843616e-06,
"loss": 0.3659,
"step": 1022
},
{
"epoch": 2.2095032397408207,
"grad_norm": 0.11793196201324463,
"learning_rate": 1.9703356786820687e-06,
"loss": 0.3644,
"step": 1023
},
{
"epoch": 2.2116630669546438,
"grad_norm": 0.11070533841848373,
"learning_rate": 1.9603485115269743e-06,
"loss": 0.3587,
"step": 1024
},
{
"epoch": 2.2138228941684663,
"grad_norm": 0.10772062093019485,
"learning_rate": 1.9503805444627054e-06,
"loss": 0.358,
"step": 1025
},
{
"epoch": 2.2159827213822894,
"grad_norm": 0.11722833663225174,
"learning_rate": 1.9404318404523605e-06,
"loss": 0.3529,
"step": 1026
},
{
"epoch": 2.2181425485961124,
"grad_norm": 0.11525849252939224,
"learning_rate": 1.930502462337362e-06,
"loss": 0.3526,
"step": 1027
},
{
"epoch": 2.2203023758099354,
"grad_norm": 0.12186475098133087,
"learning_rate": 1.920592472837057e-06,
"loss": 0.3642,
"step": 1028
},
{
"epoch": 2.222462203023758,
"grad_norm": 0.11602187156677246,
"learning_rate": 1.910701934548329e-06,
"loss": 0.3741,
"step": 1029
},
{
"epoch": 2.224622030237581,
"grad_norm": 0.12122868001461029,
"learning_rate": 1.900830909945189e-06,
"loss": 0.3658,
"step": 1030
},
{
"epoch": 2.226781857451404,
"grad_norm": 0.11481517553329468,
"learning_rate": 1.8909794613783943e-06,
"loss": 0.3586,
"step": 1031
},
{
"epoch": 2.2289416846652266,
"grad_norm": 0.10677429288625717,
"learning_rate": 1.8811476510750486e-06,
"loss": 0.367,
"step": 1032
},
{
"epoch": 2.2311015118790496,
"grad_norm": 0.11565054953098297,
"learning_rate": 1.8713355411382117e-06,
"loss": 0.3629,
"step": 1033
},
{
"epoch": 2.2332613390928726,
"grad_norm": 0.11869722604751587,
"learning_rate": 1.8615431935464984e-06,
"loss": 0.3455,
"step": 1034
},
{
"epoch": 2.2354211663066956,
"grad_norm": 0.12298930436372757,
"learning_rate": 1.8517706701536998e-06,
"loss": 0.377,
"step": 1035
},
{
"epoch": 2.237580993520518,
"grad_norm": 0.11223292350769043,
"learning_rate": 1.8420180326883857e-06,
"loss": 0.3611,
"step": 1036
},
{
"epoch": 2.239740820734341,
"grad_norm": 0.10755477845668793,
"learning_rate": 1.8322853427535148e-06,
"loss": 0.3636,
"step": 1037
},
{
"epoch": 2.2419006479481642,
"grad_norm": 0.11490552872419357,
"learning_rate": 1.822572661826047e-06,
"loss": 0.3606,
"step": 1038
},
{
"epoch": 2.2440604751619873,
"grad_norm": 0.11000396311283112,
"learning_rate": 1.8128800512565514e-06,
"loss": 0.3643,
"step": 1039
},
{
"epoch": 2.24622030237581,
"grad_norm": 0.10895387083292007,
"learning_rate": 1.803207572268826e-06,
"loss": 0.3623,
"step": 1040
},
{
"epoch": 2.248380129589633,
"grad_norm": 0.11881309747695923,
"learning_rate": 1.7935552859595058e-06,
"loss": 0.3598,
"step": 1041
},
{
"epoch": 2.250539956803456,
"grad_norm": 0.11568914353847504,
"learning_rate": 1.7839232532976746e-06,
"loss": 0.3652,
"step": 1042
},
{
"epoch": 2.2526997840172784,
"grad_norm": 0.10827185958623886,
"learning_rate": 1.7743115351244883e-06,
"loss": 0.3616,
"step": 1043
},
{
"epoch": 2.2548596112311015,
"grad_norm": 0.12083268910646439,
"learning_rate": 1.7647201921527802e-06,
"loss": 0.3696,
"step": 1044
},
{
"epoch": 2.2570194384449245,
"grad_norm": 0.11744555830955505,
"learning_rate": 1.7551492849666857e-06,
"loss": 0.3547,
"step": 1045
},
{
"epoch": 2.2591792656587475,
"grad_norm": 0.11333145946264267,
"learning_rate": 1.7455988740212576e-06,
"loss": 0.3648,
"step": 1046
},
{
"epoch": 2.26133909287257,
"grad_norm": 0.1083984524011612,
"learning_rate": 1.7360690196420816e-06,
"loss": 0.3609,
"step": 1047
},
{
"epoch": 2.263498920086393,
"grad_norm": 0.12069600075483322,
"learning_rate": 1.7265597820248987e-06,
"loss": 0.3617,
"step": 1048
},
{
"epoch": 2.265658747300216,
"grad_norm": 0.11563380807638168,
"learning_rate": 1.7170712212352187e-06,
"loss": 0.3554,
"step": 1049
},
{
"epoch": 2.267818574514039,
"grad_norm": 0.11244919896125793,
"learning_rate": 1.7076033972079503e-06,
"loss": 0.3526,
"step": 1050
},
{
"epoch": 2.2699784017278617,
"grad_norm": 0.11943572014570236,
"learning_rate": 1.698156369747016e-06,
"loss": 0.3639,
"step": 1051
},
{
"epoch": 2.2721382289416847,
"grad_norm": 0.11513727903366089,
"learning_rate": 1.6887301985249754e-06,
"loss": 0.3622,
"step": 1052
},
{
"epoch": 2.2742980561555077,
"grad_norm": 0.11251917481422424,
"learning_rate": 1.6793249430826502e-06,
"loss": 0.3606,
"step": 1053
},
{
"epoch": 2.2764578833693303,
"grad_norm": 0.11887813359498978,
"learning_rate": 1.6699406628287423e-06,
"loss": 0.3602,
"step": 1054
},
{
"epoch": 2.2786177105831533,
"grad_norm": 0.1043018326163292,
"learning_rate": 1.6605774170394683e-06,
"loss": 0.3597,
"step": 1055
},
{
"epoch": 2.2807775377969763,
"grad_norm": 0.11690463870763779,
"learning_rate": 1.651235264858177e-06,
"loss": 0.3706,
"step": 1056
},
{
"epoch": 2.282937365010799,
"grad_norm": 0.11119679361581802,
"learning_rate": 1.6419142652949793e-06,
"loss": 0.3755,
"step": 1057
},
{
"epoch": 2.285097192224622,
"grad_norm": 0.12275518476963043,
"learning_rate": 1.6326144772263752e-06,
"loss": 0.3617,
"step": 1058
},
{
"epoch": 2.287257019438445,
"grad_norm": 0.11455702781677246,
"learning_rate": 1.6233359593948777e-06,
"loss": 0.3561,
"step": 1059
},
{
"epoch": 2.289416846652268,
"grad_norm": 0.1072060838341713,
"learning_rate": 1.6140787704086502e-06,
"loss": 0.3595,
"step": 1060
},
{
"epoch": 2.2915766738660905,
"grad_norm": 0.11446718126535416,
"learning_rate": 1.6048429687411294e-06,
"loss": 0.3579,
"step": 1061
},
{
"epoch": 2.2937365010799136,
"grad_norm": 0.1233833059668541,
"learning_rate": 1.5956286127306591e-06,
"loss": 0.3571,
"step": 1062
},
{
"epoch": 2.2958963282937366,
"grad_norm": 0.11054225265979767,
"learning_rate": 1.586435760580118e-06,
"loss": 0.3592,
"step": 1063
},
{
"epoch": 2.2980561555075596,
"grad_norm": 0.11470706015825272,
"learning_rate": 1.5772644703565564e-06,
"loss": 0.3602,
"step": 1064
},
{
"epoch": 2.300215982721382,
"grad_norm": 0.1131376326084137,
"learning_rate": 1.5681147999908308e-06,
"loss": 0.3579,
"step": 1065
},
{
"epoch": 2.302375809935205,
"grad_norm": 0.1124383881688118,
"learning_rate": 1.5589868072772279e-06,
"loss": 0.3592,
"step": 1066
},
{
"epoch": 2.304535637149028,
"grad_norm": 0.13568998873233795,
"learning_rate": 1.5498805498731146e-06,
"loss": 0.3687,
"step": 1067
},
{
"epoch": 2.306695464362851,
"grad_norm": 0.11868295818567276,
"learning_rate": 1.5407960852985582e-06,
"loss": 0.3741,
"step": 1068
},
{
"epoch": 2.308855291576674,
"grad_norm": 0.11386443674564362,
"learning_rate": 1.531733470935976e-06,
"loss": 0.3702,
"step": 1069
},
{
"epoch": 2.311015118790497,
"grad_norm": 0.11155420541763306,
"learning_rate": 1.5226927640297663e-06,
"loss": 0.3543,
"step": 1070
},
{
"epoch": 2.31317494600432,
"grad_norm": 0.11469469219446182,
"learning_rate": 1.5136740216859464e-06,
"loss": 0.3718,
"step": 1071
},
{
"epoch": 2.3153347732181424,
"grad_norm": 0.10761052370071411,
"learning_rate": 1.5046773008717968e-06,
"loss": 0.3728,
"step": 1072
},
{
"epoch": 2.3174946004319654,
"grad_norm": 0.10855443775653839,
"learning_rate": 1.4957026584154926e-06,
"loss": 0.3612,
"step": 1073
},
{
"epoch": 2.3196544276457884,
"grad_norm": 0.11300813406705856,
"learning_rate": 1.4867501510057548e-06,
"loss": 0.3629,
"step": 1074
},
{
"epoch": 2.3218142548596115,
"grad_norm": 0.1190650686621666,
"learning_rate": 1.4778198351914853e-06,
"loss": 0.358,
"step": 1075
},
{
"epoch": 2.323974082073434,
"grad_norm": 0.11081087589263916,
"learning_rate": 1.4689117673814135e-06,
"loss": 0.3579,
"step": 1076
},
{
"epoch": 2.326133909287257,
"grad_norm": 0.10845163464546204,
"learning_rate": 1.4600260038437376e-06,
"loss": 0.3547,
"step": 1077
},
{
"epoch": 2.32829373650108,
"grad_norm": 0.10712606459856033,
"learning_rate": 1.4511626007057667e-06,
"loss": 0.3702,
"step": 1078
},
{
"epoch": 2.3304535637149026,
"grad_norm": 0.10121244937181473,
"learning_rate": 1.4423216139535735e-06,
"loss": 0.3701,
"step": 1079
},
{
"epoch": 2.3326133909287257,
"grad_norm": 0.10943249613046646,
"learning_rate": 1.4335030994316357e-06,
"loss": 0.3673,
"step": 1080
},
{
"epoch": 2.3347732181425487,
"grad_norm": 0.11610903590917587,
"learning_rate": 1.4247071128424838e-06,
"loss": 0.3603,
"step": 1081
},
{
"epoch": 2.3369330453563713,
"grad_norm": 0.11263252794742584,
"learning_rate": 1.4159337097463515e-06,
"loss": 0.3646,
"step": 1082
},
{
"epoch": 2.3390928725701943,
"grad_norm": 0.11808553338050842,
"learning_rate": 1.407182945560817e-06,
"loss": 0.3551,
"step": 1083
},
{
"epoch": 2.3412526997840173,
"grad_norm": 0.11071130633354187,
"learning_rate": 1.3984548755604655e-06,
"loss": 0.3591,
"step": 1084
},
{
"epoch": 2.3434125269978403,
"grad_norm": 0.10774732381105423,
"learning_rate": 1.38974955487653e-06,
"loss": 0.3701,
"step": 1085
},
{
"epoch": 2.345572354211663,
"grad_norm": 0.10596179217100143,
"learning_rate": 1.3810670384965469e-06,
"loss": 0.3619,
"step": 1086
},
{
"epoch": 2.347732181425486,
"grad_norm": 0.10586302727460861,
"learning_rate": 1.372407381264011e-06,
"loss": 0.3671,
"step": 1087
},
{
"epoch": 2.349892008639309,
"grad_norm": 0.11271238327026367,
"learning_rate": 1.3637706378780209e-06,
"loss": 0.369,
"step": 1088
},
{
"epoch": 2.352051835853132,
"grad_norm": 0.11300753057003021,
"learning_rate": 1.3551568628929434e-06,
"loss": 0.366,
"step": 1089
},
{
"epoch": 2.3542116630669545,
"grad_norm": 0.10634942352771759,
"learning_rate": 1.346566110718061e-06,
"loss": 0.3608,
"step": 1090
},
{
"epoch": 2.3563714902807775,
"grad_norm": 0.11670755594968796,
"learning_rate": 1.337998435617235e-06,
"loss": 0.3649,
"step": 1091
},
{
"epoch": 2.3585313174946005,
"grad_norm": 0.11227838695049286,
"learning_rate": 1.3294538917085586e-06,
"loss": 0.3496,
"step": 1092
},
{
"epoch": 2.360691144708423,
"grad_norm": 0.11369525641202927,
"learning_rate": 1.3209325329640126e-06,
"loss": 0.367,
"step": 1093
},
{
"epoch": 2.362850971922246,
"grad_norm": 0.10753148049116135,
"learning_rate": 1.312434413209131e-06,
"loss": 0.3654,
"step": 1094
},
{
"epoch": 2.365010799136069,
"grad_norm": 0.11079417914152145,
"learning_rate": 1.3039595861226579e-06,
"loss": 0.3535,
"step": 1095
},
{
"epoch": 2.367170626349892,
"grad_norm": 0.10849615931510925,
"learning_rate": 1.2955081052362072e-06,
"loss": 0.3584,
"step": 1096
},
{
"epoch": 2.3693304535637147,
"grad_norm": 0.10960622876882553,
"learning_rate": 1.2870800239339237e-06,
"loss": 0.3578,
"step": 1097
},
{
"epoch": 2.3714902807775378,
"grad_norm": 0.11225436627864838,
"learning_rate": 1.2786753954521508e-06,
"loss": 0.3645,
"step": 1098
},
{
"epoch": 2.373650107991361,
"grad_norm": 0.11186996102333069,
"learning_rate": 1.2702942728790897e-06,
"loss": 0.3564,
"step": 1099
},
{
"epoch": 2.375809935205184,
"grad_norm": 0.10800560563802719,
"learning_rate": 1.2619367091544654e-06,
"loss": 0.3595,
"step": 1100
},
{
"epoch": 2.3779697624190064,
"grad_norm": 0.11503534764051437,
"learning_rate": 1.2536027570691938e-06,
"loss": 0.363,
"step": 1101
},
{
"epoch": 2.3801295896328294,
"grad_norm": 0.1053680032491684,
"learning_rate": 1.2452924692650443e-06,
"loss": 0.3668,
"step": 1102
},
{
"epoch": 2.3822894168466524,
"grad_norm": 0.10837449133396149,
"learning_rate": 1.2370058982343109e-06,
"loss": 0.3646,
"step": 1103
},
{
"epoch": 2.384449244060475,
"grad_norm": 0.10401103645563126,
"learning_rate": 1.2287430963194807e-06,
"loss": 0.3523,
"step": 1104
},
{
"epoch": 2.386609071274298,
"grad_norm": 0.1130133643746376,
"learning_rate": 1.2205041157129017e-06,
"loss": 0.3522,
"step": 1105
},
{
"epoch": 2.388768898488121,
"grad_norm": 0.11143437772989273,
"learning_rate": 1.2122890084564542e-06,
"loss": 0.3622,
"step": 1106
},
{
"epoch": 2.390928725701944,
"grad_norm": 0.1088298037648201,
"learning_rate": 1.204097826441218e-06,
"loss": 0.3524,
"step": 1107
},
{
"epoch": 2.3930885529157666,
"grad_norm": 0.11658685654401779,
"learning_rate": 1.1959306214071508e-06,
"loss": 0.3649,
"step": 1108
},
{
"epoch": 2.3952483801295896,
"grad_norm": 0.10530900955200195,
"learning_rate": 1.18778744494276e-06,
"loss": 0.3732,
"step": 1109
},
{
"epoch": 2.3974082073434126,
"grad_norm": 0.10576412826776505,
"learning_rate": 1.1796683484847731e-06,
"loss": 0.3528,
"step": 1110
},
{
"epoch": 2.3995680345572357,
"grad_norm": 0.10664583742618561,
"learning_rate": 1.1715733833178178e-06,
"loss": 0.3638,
"step": 1111
},
{
"epoch": 2.4017278617710582,
"grad_norm": 0.11170324683189392,
"learning_rate": 1.1635026005740902e-06,
"loss": 0.3632,
"step": 1112
},
{
"epoch": 2.4038876889848813,
"grad_norm": 0.10899297147989273,
"learning_rate": 1.1554560512330437e-06,
"loss": 0.3717,
"step": 1113
},
{
"epoch": 2.4060475161987043,
"grad_norm": 0.10355883091688156,
"learning_rate": 1.1474337861210543e-06,
"loss": 0.3669,
"step": 1114
},
{
"epoch": 2.408207343412527,
"grad_norm": 0.11601343005895615,
"learning_rate": 1.1394358559111101e-06,
"loss": 0.3675,
"step": 1115
},
{
"epoch": 2.41036717062635,
"grad_norm": 0.10625651478767395,
"learning_rate": 1.1314623111224865e-06,
"loss": 0.3696,
"step": 1116
},
{
"epoch": 2.412526997840173,
"grad_norm": 0.1087704598903656,
"learning_rate": 1.1235132021204226e-06,
"loss": 0.3678,
"step": 1117
},
{
"epoch": 2.4146868250539955,
"grad_norm": 0.1125335842370987,
"learning_rate": 1.1155885791158128e-06,
"loss": 0.3676,
"step": 1118
},
{
"epoch": 2.4168466522678185,
"grad_norm": 0.10977572947740555,
"learning_rate": 1.1076884921648834e-06,
"loss": 0.3597,
"step": 1119
},
{
"epoch": 2.4190064794816415,
"grad_norm": 0.11624909937381744,
"learning_rate": 1.0998129911688766e-06,
"loss": 0.3645,
"step": 1120
},
{
"epoch": 2.4211663066954645,
"grad_norm": 0.11193333566188812,
"learning_rate": 1.0919621258737384e-06,
"loss": 0.3679,
"step": 1121
},
{
"epoch": 2.423326133909287,
"grad_norm": 0.10702624171972275,
"learning_rate": 1.0841359458697986e-06,
"loss": 0.3675,
"step": 1122
},
{
"epoch": 2.42548596112311,
"grad_norm": 0.11081477999687195,
"learning_rate": 1.0763345005914649e-06,
"loss": 0.3733,
"step": 1123
},
{
"epoch": 2.427645788336933,
"grad_norm": 0.11152873933315277,
"learning_rate": 1.0685578393169054e-06,
"loss": 0.3634,
"step": 1124
},
{
"epoch": 2.429805615550756,
"grad_norm": 0.11278684437274933,
"learning_rate": 1.0608060111677409e-06,
"loss": 0.3646,
"step": 1125
},
{
"epoch": 2.4319654427645787,
"grad_norm": 0.10329707711935043,
"learning_rate": 1.053079065108728e-06,
"loss": 0.3616,
"step": 1126
},
{
"epoch": 2.4341252699784017,
"grad_norm": 0.11579885333776474,
"learning_rate": 1.0453770499474585e-06,
"loss": 0.3642,
"step": 1127
},
{
"epoch": 2.4362850971922247,
"grad_norm": 0.11287212371826172,
"learning_rate": 1.037700014334047e-06,
"loss": 0.3588,
"step": 1128
},
{
"epoch": 2.4384449244060473,
"grad_norm": 0.10435645282268524,
"learning_rate": 1.0300480067608232e-06,
"loss": 0.3621,
"step": 1129
},
{
"epoch": 2.4406047516198703,
"grad_norm": 0.1117047443985939,
"learning_rate": 1.0224210755620257e-06,
"loss": 0.3665,
"step": 1130
},
{
"epoch": 2.4427645788336934,
"grad_norm": 0.11821126937866211,
"learning_rate": 1.014819268913495e-06,
"loss": 0.3659,
"step": 1131
},
{
"epoch": 2.4449244060475164,
"grad_norm": 0.11257217824459076,
"learning_rate": 1.0072426348323754e-06,
"loss": 0.3629,
"step": 1132
},
{
"epoch": 2.447084233261339,
"grad_norm": 0.10960426181554794,
"learning_rate": 9.99691221176805e-07,
"loss": 0.3702,
"step": 1133
},
{
"epoch": 2.449244060475162,
"grad_norm": 0.11274091899394989,
"learning_rate": 9.921650756456164e-07,
"loss": 0.3552,
"step": 1134
},
{
"epoch": 2.451403887688985,
"grad_norm": 0.11033818125724792,
"learning_rate": 9.84664245778037e-07,
"loss": 0.3622,
"step": 1135
},
{
"epoch": 2.453563714902808,
"grad_norm": 0.11202115565538406,
"learning_rate": 9.771887789533818e-07,
"loss": 0.3641,
"step": 1136
},
{
"epoch": 2.4557235421166306,
"grad_norm": 0.10436037182807922,
"learning_rate": 9.69738722390765e-07,
"loss": 0.3722,
"step": 1137
},
{
"epoch": 2.4578833693304536,
"grad_norm": 0.11260079592466354,
"learning_rate": 9.623141231487904e-07,
"loss": 0.3664,
"step": 1138
},
{
"epoch": 2.4600431965442766,
"grad_norm": 0.10981511324644089,
"learning_rate": 9.549150281252633e-07,
"loss": 0.3729,
"step": 1139
},
{
"epoch": 2.462203023758099,
"grad_norm": 0.11340730637311935,
"learning_rate": 9.475414840568903e-07,
"loss": 0.3614,
"step": 1140
},
{
"epoch": 2.464362850971922,
"grad_norm": 0.10501902550458908,
"learning_rate": 9.401935375189802e-07,
"loss": 0.3601,
"step": 1141
},
{
"epoch": 2.466522678185745,
"grad_norm": 0.11369086056947708,
"learning_rate": 9.32871234925159e-07,
"loss": 0.3669,
"step": 1142
},
{
"epoch": 2.468682505399568,
"grad_norm": 0.10842647403478622,
"learning_rate": 9.255746225270689e-07,
"loss": 0.3582,
"step": 1143
},
{
"epoch": 2.470842332613391,
"grad_norm": 0.1089843288064003,
"learning_rate": 9.183037464140804e-07,
"loss": 0.3532,
"step": 1144
},
{
"epoch": 2.473002159827214,
"grad_norm": 0.1023058295249939,
"learning_rate": 9.110586525129988e-07,
"loss": 0.3473,
"step": 1145
},
{
"epoch": 2.475161987041037,
"grad_norm": 0.1110844761133194,
"learning_rate": 9.038393865877725e-07,
"loss": 0.3629,
"step": 1146
},
{
"epoch": 2.4773218142548594,
"grad_norm": 0.10468819737434387,
"learning_rate": 8.966459942392108e-07,
"loss": 0.3631,
"step": 1147
},
{
"epoch": 2.4794816414686824,
"grad_norm": 0.11002985388040543,
"learning_rate": 8.894785209046886e-07,
"loss": 0.3584,
"step": 1148
},
{
"epoch": 2.4816414686825055,
"grad_norm": 0.10573374480009079,
"learning_rate": 8.823370118578628e-07,
"loss": 0.3681,
"step": 1149
},
{
"epoch": 2.4838012958963285,
"grad_norm": 0.11796517670154572,
"learning_rate": 8.752215122083874e-07,
"loss": 0.3617,
"step": 1150
},
{
"epoch": 2.485961123110151,
"grad_norm": 0.1184302419424057,
"learning_rate": 8.68132066901623e-07,
"loss": 0.3672,
"step": 1151
},
{
"epoch": 2.488120950323974,
"grad_norm": 0.13177676498889923,
"learning_rate": 8.610687207183604e-07,
"loss": 0.3573,
"step": 1152
},
{
"epoch": 2.490280777537797,
"grad_norm": 0.11671025305986404,
"learning_rate": 8.540315182745329e-07,
"loss": 0.3569,
"step": 1153
},
{
"epoch": 2.4924406047516197,
"grad_norm": 0.10741881281137466,
"learning_rate": 8.470205040209362e-07,
"loss": 0.3558,
"step": 1154
},
{
"epoch": 2.4946004319654427,
"grad_norm": 0.12825675308704376,
"learning_rate": 8.400357222429473e-07,
"loss": 0.3575,
"step": 1155
},
{
"epoch": 2.4967602591792657,
"grad_norm": 0.10776403546333313,
"learning_rate": 8.330772170602424e-07,
"loss": 0.3589,
"step": 1156
},
{
"epoch": 2.4989200863930887,
"grad_norm": 0.11745335906744003,
"learning_rate": 8.261450324265225e-07,
"loss": 0.3617,
"step": 1157
},
{
"epoch": 2.5010799136069113,
"grad_norm": 0.10803595185279846,
"learning_rate": 8.192392121292336e-07,
"loss": 0.3636,
"step": 1158
},
{
"epoch": 2.5032397408207343,
"grad_norm": 0.11620043963193893,
"learning_rate": 8.123597997892918e-07,
"loss": 0.3688,
"step": 1159
},
{
"epoch": 2.5053995680345573,
"grad_norm": 0.11771270632743835,
"learning_rate": 8.055068388608011e-07,
"loss": 0.3633,
"step": 1160
},
{
"epoch": 2.5075593952483803,
"grad_norm": 0.11002473533153534,
"learning_rate": 7.986803726307901e-07,
"loss": 0.3649,
"step": 1161
},
{
"epoch": 2.509719222462203,
"grad_norm": 0.11476074159145355,
"learning_rate": 7.918804442189271e-07,
"loss": 0.3482,
"step": 1162
},
{
"epoch": 2.511879049676026,
"grad_norm": 0.10824240744113922,
"learning_rate": 7.851070965772572e-07,
"loss": 0.3502,
"step": 1163
},
{
"epoch": 2.514038876889849,
"grad_norm": 0.11206220835447311,
"learning_rate": 7.783603724899258e-07,
"loss": 0.3668,
"step": 1164
},
{
"epoch": 2.5161987041036715,
"grad_norm": 0.11207690834999084,
"learning_rate": 7.716403145729073e-07,
"loss": 0.3585,
"step": 1165
},
{
"epoch": 2.5183585313174945,
"grad_norm": 0.10834087431430817,
"learning_rate": 7.649469652737407e-07,
"loss": 0.3557,
"step": 1166
},
{
"epoch": 2.5205183585313176,
"grad_norm": 0.11165751516819,
"learning_rate": 7.582803668712579e-07,
"loss": 0.3654,
"step": 1167
},
{
"epoch": 2.52267818574514,
"grad_norm": 0.10847879201173782,
"learning_rate": 7.51640561475318e-07,
"loss": 0.362,
"step": 1168
},
{
"epoch": 2.524838012958963,
"grad_norm": 0.11347544938325882,
"learning_rate": 7.450275910265415e-07,
"loss": 0.3631,
"step": 1169
},
{
"epoch": 2.526997840172786,
"grad_norm": 0.11547064036130905,
"learning_rate": 7.384414972960419e-07,
"loss": 0.3613,
"step": 1170
},
{
"epoch": 2.529157667386609,
"grad_norm": 0.11166190356016159,
"learning_rate": 7.318823218851668e-07,
"loss": 0.3664,
"step": 1171
},
{
"epoch": 2.531317494600432,
"grad_norm": 0.11519124358892441,
"learning_rate": 7.253501062252338e-07,
"loss": 0.3715,
"step": 1172
},
{
"epoch": 2.533477321814255,
"grad_norm": 0.12818704545497894,
"learning_rate": 7.188448915772673e-07,
"loss": 0.3568,
"step": 1173
},
{
"epoch": 2.535637149028078,
"grad_norm": 0.11333166062831879,
"learning_rate": 7.123667190317396e-07,
"loss": 0.366,
"step": 1174
},
{
"epoch": 2.537796976241901,
"grad_norm": 0.11098440736532211,
"learning_rate": 7.059156295083064e-07,
"loss": 0.3651,
"step": 1175
},
{
"epoch": 2.5399568034557234,
"grad_norm": 0.11005040258169174,
"learning_rate": 6.994916637555571e-07,
"loss": 0.3658,
"step": 1176
},
{
"epoch": 2.5421166306695464,
"grad_norm": 0.10551054775714874,
"learning_rate": 6.930948623507505e-07,
"loss": 0.3654,
"step": 1177
},
{
"epoch": 2.5442764578833694,
"grad_norm": 0.10704758763313293,
"learning_rate": 6.86725265699561e-07,
"loss": 0.3562,
"step": 1178
},
{
"epoch": 2.546436285097192,
"grad_norm": 0.1092720702290535,
"learning_rate": 6.803829140358237e-07,
"loss": 0.3619,
"step": 1179
},
{
"epoch": 2.548596112311015,
"grad_norm": 0.10640691220760345,
"learning_rate": 6.74067847421277e-07,
"loss": 0.3674,
"step": 1180
},
{
"epoch": 2.550755939524838,
"grad_norm": 0.10517946630716324,
"learning_rate": 6.677801057453143e-07,
"loss": 0.3556,
"step": 1181
},
{
"epoch": 2.552915766738661,
"grad_norm": 0.10489367693662643,
"learning_rate": 6.615197287247299e-07,
"loss": 0.3766,
"step": 1182
},
{
"epoch": 2.555075593952484,
"grad_norm": 0.11467967927455902,
"learning_rate": 6.552867559034687e-07,
"loss": 0.3569,
"step": 1183
},
{
"epoch": 2.5572354211663066,
"grad_norm": 0.11009713262319565,
"learning_rate": 6.490812266523716e-07,
"loss": 0.3654,
"step": 1184
},
{
"epoch": 2.5593952483801297,
"grad_norm": 0.10729658603668213,
"learning_rate": 6.429031801689362e-07,
"loss": 0.3564,
"step": 1185
},
{
"epoch": 2.5615550755939527,
"grad_norm": 0.1073872372508049,
"learning_rate": 6.36752655477062e-07,
"loss": 0.3606,
"step": 1186
},
{
"epoch": 2.5637149028077753,
"grad_norm": 0.10580222308635712,
"learning_rate": 6.30629691426804e-07,
"loss": 0.371,
"step": 1187
},
{
"epoch": 2.5658747300215983,
"grad_norm": 0.11771810799837112,
"learning_rate": 6.245343266941328e-07,
"loss": 0.3597,
"step": 1188
},
{
"epoch": 2.5680345572354213,
"grad_norm": 0.11992885917425156,
"learning_rate": 6.184665997806832e-07,
"loss": 0.3559,
"step": 1189
},
{
"epoch": 2.570194384449244,
"grad_norm": 0.11079053580760956,
"learning_rate": 6.124265490135161e-07,
"loss": 0.3635,
"step": 1190
},
{
"epoch": 2.572354211663067,
"grad_norm": 0.10871004313230515,
"learning_rate": 6.064142125448763e-07,
"loss": 0.3625,
"step": 1191
},
{
"epoch": 2.57451403887689,
"grad_norm": 0.11944089829921722,
"learning_rate": 6.004296283519478e-07,
"loss": 0.3531,
"step": 1192
},
{
"epoch": 2.5766738660907125,
"grad_norm": 0.11835870891809464,
"learning_rate": 5.944728342366179e-07,
"loss": 0.3596,
"step": 1193
},
{
"epoch": 2.5788336933045355,
"grad_norm": 0.10851329565048218,
"learning_rate": 5.885438678252342e-07,
"loss": 0.3692,
"step": 1194
},
{
"epoch": 2.5809935205183585,
"grad_norm": 0.10725897550582886,
"learning_rate": 5.826427665683715e-07,
"loss": 0.3621,
"step": 1195
},
{
"epoch": 2.5831533477321815,
"grad_norm": 0.10977955162525177,
"learning_rate": 5.767695677405921e-07,
"loss": 0.3536,
"step": 1196
},
{
"epoch": 2.5853131749460045,
"grad_norm": 0.11643577367067337,
"learning_rate": 5.709243084402128e-07,
"loss": 0.3624,
"step": 1197
},
{
"epoch": 2.587473002159827,
"grad_norm": 0.11957161873579025,
"learning_rate": 5.651070255890689e-07,
"loss": 0.3567,
"step": 1198
},
{
"epoch": 2.58963282937365,
"grad_norm": 0.11547524482011795,
"learning_rate": 5.593177559322776e-07,
"loss": 0.3526,
"step": 1199
},
{
"epoch": 2.591792656587473,
"grad_norm": 0.10810908675193787,
"learning_rate": 5.535565360380146e-07,
"loss": 0.3627,
"step": 1200
},
{
"epoch": 2.5939524838012957,
"grad_norm": 0.10978656262159348,
"learning_rate": 5.478234022972756e-07,
"loss": 0.3689,
"step": 1201
},
{
"epoch": 2.5961123110151187,
"grad_norm": 0.11710033565759659,
"learning_rate": 5.421183909236494e-07,
"loss": 0.354,
"step": 1202
},
{
"epoch": 2.5982721382289418,
"grad_norm": 0.10731150209903717,
"learning_rate": 5.364415379530891e-07,
"loss": 0.3672,
"step": 1203
},
{
"epoch": 2.6004319654427643,
"grad_norm": 0.10609705001115799,
"learning_rate": 5.307928792436812e-07,
"loss": 0.3541,
"step": 1204
},
{
"epoch": 2.6025917926565874,
"grad_norm": 0.11076472699642181,
"learning_rate": 5.251724504754258e-07,
"loss": 0.3651,
"step": 1205
},
{
"epoch": 2.6047516198704104,
"grad_norm": 0.11111797392368317,
"learning_rate": 5.19580287150005e-07,
"loss": 0.3557,
"step": 1206
},
{
"epoch": 2.6069114470842334,
"grad_norm": 0.10651623457670212,
"learning_rate": 5.140164245905633e-07,
"loss": 0.3537,
"step": 1207
},
{
"epoch": 2.6090712742980564,
"grad_norm": 0.11073900759220123,
"learning_rate": 5.084808979414779e-07,
"loss": 0.3623,
"step": 1208
},
{
"epoch": 2.611231101511879,
"grad_norm": 0.11509796231985092,
"learning_rate": 5.029737421681446e-07,
"loss": 0.3669,
"step": 1209
},
{
"epoch": 2.613390928725702,
"grad_norm": 0.11190790683031082,
"learning_rate": 4.97494992056754e-07,
"loss": 0.3662,
"step": 1210
},
{
"epoch": 2.615550755939525,
"grad_norm": 0.11598829925060272,
"learning_rate": 4.920446822140673e-07,
"loss": 0.3617,
"step": 1211
},
{
"epoch": 2.6177105831533476,
"grad_norm": 0.11533954739570618,
"learning_rate": 4.866228470672041e-07,
"loss": 0.3589,
"step": 1212
},
{
"epoch": 2.6198704103671706,
"grad_norm": 0.10564534366130829,
"learning_rate": 4.812295208634238e-07,
"loss": 0.3626,
"step": 1213
},
{
"epoch": 2.6220302375809936,
"grad_norm": 0.11170712113380432,
"learning_rate": 4.758647376699033e-07,
"loss": 0.3672,
"step": 1214
},
{
"epoch": 2.624190064794816,
"grad_norm": 0.11519314348697662,
"learning_rate": 4.705285313735297e-07,
"loss": 0.3666,
"step": 1215
},
{
"epoch": 2.626349892008639,
"grad_norm": 0.11605649441480637,
"learning_rate": 4.6522093568068307e-07,
"loss": 0.3484,
"step": 1216
},
{
"epoch": 2.6285097192224622,
"grad_norm": 0.11404189467430115,
"learning_rate": 4.599419841170216e-07,
"loss": 0.3555,
"step": 1217
},
{
"epoch": 2.6306695464362853,
"grad_norm": 0.11835578829050064,
"learning_rate": 4.546917100272735e-07,
"loss": 0.3552,
"step": 1218
},
{
"epoch": 2.632829373650108,
"grad_norm": 0.11513664573431015,
"learning_rate": 4.494701465750217e-07,
"loss": 0.3522,
"step": 1219
},
{
"epoch": 2.634989200863931,
"grad_norm": 0.11740648001432419,
"learning_rate": 4.4427732674250045e-07,
"loss": 0.3625,
"step": 1220
},
{
"epoch": 2.637149028077754,
"grad_norm": 0.12071909755468369,
"learning_rate": 4.391132833303807e-07,
"loss": 0.3684,
"step": 1221
},
{
"epoch": 2.639308855291577,
"grad_norm": 0.1136975884437561,
"learning_rate": 4.3397804895756957e-07,
"loss": 0.3684,
"step": 1222
},
{
"epoch": 2.6414686825053995,
"grad_norm": 0.11149821430444717,
"learning_rate": 4.2887165606099513e-07,
"loss": 0.3603,
"step": 1223
},
{
"epoch": 2.6436285097192225,
"grad_norm": 0.12100395560264587,
"learning_rate": 4.237941368954124e-07,
"loss": 0.3624,
"step": 1224
},
{
"epoch": 2.6457883369330455,
"grad_norm": 0.1222655400633812,
"learning_rate": 4.1874552353319107e-07,
"loss": 0.3526,
"step": 1225
},
{
"epoch": 2.647948164146868,
"grad_norm": 0.11921314895153046,
"learning_rate": 4.137258478641176e-07,
"loss": 0.3647,
"step": 1226
},
{
"epoch": 2.650107991360691,
"grad_norm": 0.11398887634277344,
"learning_rate": 4.087351415951918e-07,
"loss": 0.3593,
"step": 1227
},
{
"epoch": 2.652267818574514,
"grad_norm": 0.11155658215284348,
"learning_rate": 4.0377343625042587e-07,
"loss": 0.37,
"step": 1228
},
{
"epoch": 2.6544276457883367,
"grad_norm": 0.1191490963101387,
"learning_rate": 3.9884076317064813e-07,
"loss": 0.3588,
"step": 1229
},
{
"epoch": 2.6565874730021597,
"grad_norm": 0.12826910614967346,
"learning_rate": 3.9393715351330243e-07,
"loss": 0.3566,
"step": 1230
},
{
"epoch": 2.6587473002159827,
"grad_norm": 0.11224586516618729,
"learning_rate": 3.890626382522539e-07,
"loss": 0.3604,
"step": 1231
},
{
"epoch": 2.6609071274298057,
"grad_norm": 0.11304951459169388,
"learning_rate": 3.8421724817758745e-07,
"loss": 0.3719,
"step": 1232
},
{
"epoch": 2.6630669546436287,
"grad_norm": 0.10955885052680969,
"learning_rate": 3.794010138954213e-07,
"loss": 0.3611,
"step": 1233
},
{
"epoch": 2.6652267818574513,
"grad_norm": 0.11885318905115128,
"learning_rate": 3.7461396582771035e-07,
"loss": 0.3732,
"step": 1234
},
{
"epoch": 2.6673866090712743,
"grad_norm": 0.11816181242465973,
"learning_rate": 3.698561342120499e-07,
"loss": 0.3577,
"step": 1235
},
{
"epoch": 2.6695464362850974,
"grad_norm": 0.11143229156732559,
"learning_rate": 3.651275491014905e-07,
"loss": 0.3561,
"step": 1236
},
{
"epoch": 2.67170626349892,
"grad_norm": 0.113620825111866,
"learning_rate": 3.604282403643472e-07,
"loss": 0.3659,
"step": 1237
},
{
"epoch": 2.673866090712743,
"grad_norm": 0.11192460358142853,
"learning_rate": 3.557582376840063e-07,
"loss": 0.3627,
"step": 1238
},
{
"epoch": 2.676025917926566,
"grad_norm": 0.11559736728668213,
"learning_rate": 3.511175705587433e-07,
"loss": 0.3632,
"step": 1239
},
{
"epoch": 2.6781857451403885,
"grad_norm": 0.11298345029354095,
"learning_rate": 3.465062683015341e-07,
"loss": 0.3617,
"step": 1240
},
{
"epoch": 2.6803455723542116,
"grad_norm": 0.1136719286441803,
"learning_rate": 3.419243600398703e-07,
"loss": 0.3534,
"step": 1241
},
{
"epoch": 2.6825053995680346,
"grad_norm": 0.11135457456111908,
"learning_rate": 3.373718747155752e-07,
"loss": 0.3723,
"step": 1242
},
{
"epoch": 2.6846652267818576,
"grad_norm": 0.10721197724342346,
"learning_rate": 3.328488410846187e-07,
"loss": 0.3551,
"step": 1243
},
{
"epoch": 2.6868250539956806,
"grad_norm": 0.11308667808771133,
"learning_rate": 3.283552877169399e-07,
"loss": 0.3667,
"step": 1244
},
{
"epoch": 2.688984881209503,
"grad_norm": 0.10848429799079895,
"learning_rate": 3.2389124299626483e-07,
"loss": 0.3643,
"step": 1245
},
{
"epoch": 2.691144708423326,
"grad_norm": 0.11723221838474274,
"learning_rate": 3.194567351199257e-07,
"loss": 0.3717,
"step": 1246
},
{
"epoch": 2.693304535637149,
"grad_norm": 0.12472040206193924,
"learning_rate": 3.150517920986851e-07,
"loss": 0.3608,
"step": 1247
},
{
"epoch": 2.695464362850972,
"grad_norm": 0.11016938090324402,
"learning_rate": 3.106764417565561e-07,
"loss": 0.3588,
"step": 1248
},
{
"epoch": 2.697624190064795,
"grad_norm": 0.11815854161977768,
"learning_rate": 3.0633071173062966e-07,
"loss": 0.3617,
"step": 1249
},
{
"epoch": 2.699784017278618,
"grad_norm": 0.1177084818482399,
"learning_rate": 3.0201462947089865e-07,
"loss": 0.3576,
"step": 1250
},
{
"epoch": 2.7019438444924404,
"grad_norm": 0.11179111897945404,
"learning_rate": 2.9772822224008515e-07,
"loss": 0.3667,
"step": 1251
},
{
"epoch": 2.7041036717062634,
"grad_norm": 0.11454194784164429,
"learning_rate": 2.9347151711346556e-07,
"loss": 0.3707,
"step": 1252
},
{
"epoch": 2.7062634989200864,
"grad_norm": 0.10757472366094589,
"learning_rate": 2.892445409787037e-07,
"loss": 0.3628,
"step": 1253
},
{
"epoch": 2.708423326133909,
"grad_norm": 0.11914849281311035,
"learning_rate": 2.850473205356774e-07,
"loss": 0.3468,
"step": 1254
},
{
"epoch": 2.710583153347732,
"grad_norm": 0.1173713430762291,
"learning_rate": 2.8087988229631325e-07,
"loss": 0.3668,
"step": 1255
},
{
"epoch": 2.712742980561555,
"grad_norm": 0.11365855485200882,
"learning_rate": 2.76742252584416e-07,
"loss": 0.359,
"step": 1256
},
{
"epoch": 2.714902807775378,
"grad_norm": 0.11546127498149872,
"learning_rate": 2.7263445753550275e-07,
"loss": 0.364,
"step": 1257
},
{
"epoch": 2.717062634989201,
"grad_norm": 0.11186777800321579,
"learning_rate": 2.685565230966408e-07,
"loss": 0.3526,
"step": 1258
},
{
"epoch": 2.7192224622030237,
"grad_norm": 0.10442403703927994,
"learning_rate": 2.6450847502627883e-07,
"loss": 0.3551,
"step": 1259
},
{
"epoch": 2.7213822894168467,
"grad_norm": 0.12204797565937042,
"learning_rate": 2.604903388940899e-07,
"loss": 0.3587,
"step": 1260
},
{
"epoch": 2.7235421166306697,
"grad_norm": 0.11084363609552383,
"learning_rate": 2.5650214008080544e-07,
"loss": 0.3679,
"step": 1261
},
{
"epoch": 2.7257019438444923,
"grad_norm": 0.10979737341403961,
"learning_rate": 2.525439037780558e-07,
"loss": 0.3717,
"step": 1262
},
{
"epoch": 2.7278617710583153,
"grad_norm": 0.11145438998937607,
"learning_rate": 2.486156549882135e-07,
"loss": 0.3613,
"step": 1263
},
{
"epoch": 2.7300215982721383,
"grad_norm": 0.11015837639570236,
"learning_rate": 2.447174185242324e-07,
"loss": 0.3652,
"step": 1264
},
{
"epoch": 2.732181425485961,
"grad_norm": 0.1096833273768425,
"learning_rate": 2.40849219009493e-07,
"loss": 0.3531,
"step": 1265
},
{
"epoch": 2.734341252699784,
"grad_norm": 0.109636589884758,
"learning_rate": 2.3701108087764657e-07,
"loss": 0.3596,
"step": 1266
},
{
"epoch": 2.736501079913607,
"grad_norm": 0.11428305506706238,
"learning_rate": 2.3320302837245846e-07,
"loss": 0.3659,
"step": 1267
},
{
"epoch": 2.73866090712743,
"grad_norm": 0.11387787014245987,
"learning_rate": 2.2942508554765764e-07,
"loss": 0.3726,
"step": 1268
},
{
"epoch": 2.740820734341253,
"grad_norm": 0.10690239071846008,
"learning_rate": 2.2567727626678527e-07,
"loss": 0.3651,
"step": 1269
},
{
"epoch": 2.7429805615550755,
"grad_norm": 0.10845934599637985,
"learning_rate": 2.2195962420304083e-07,
"loss": 0.3608,
"step": 1270
},
{
"epoch": 2.7451403887688985,
"grad_norm": 0.11751694232225418,
"learning_rate": 2.1827215283913683e-07,
"loss": 0.3659,
"step": 1271
},
{
"epoch": 2.7473002159827216,
"grad_norm": 0.10652041435241699,
"learning_rate": 2.1461488546714425e-07,
"loss": 0.3634,
"step": 1272
},
{
"epoch": 2.749460043196544,
"grad_norm": 0.10296986997127533,
"learning_rate": 2.1098784518835292e-07,
"loss": 0.3632,
"step": 1273
},
{
"epoch": 2.751619870410367,
"grad_norm": 0.10827996581792831,
"learning_rate": 2.0739105491312028e-07,
"loss": 0.3624,
"step": 1274
},
{
"epoch": 2.75377969762419,
"grad_norm": 0.11208463460206985,
"learning_rate": 2.0382453736072838e-07,
"loss": 0.3552,
"step": 1275
},
{
"epoch": 2.7559395248380127,
"grad_norm": 0.11274047195911407,
"learning_rate": 2.0028831505924162e-07,
"loss": 0.3613,
"step": 1276
},
{
"epoch": 2.7580993520518358,
"grad_norm": 0.10478544235229492,
"learning_rate": 1.967824103453597e-07,
"loss": 0.3592,
"step": 1277
},
{
"epoch": 2.760259179265659,
"grad_norm": 0.10351528972387314,
"learning_rate": 1.9330684536428335e-07,
"loss": 0.3693,
"step": 1278
},
{
"epoch": 2.762419006479482,
"grad_norm": 0.11334282159805298,
"learning_rate": 1.8986164206957037e-07,
"loss": 0.3615,
"step": 1279
},
{
"epoch": 2.7645788336933044,
"grad_norm": 0.10871846228837967,
"learning_rate": 1.8644682222299703e-07,
"loss": 0.3644,
"step": 1280
},
{
"epoch": 2.7667386609071274,
"grad_norm": 0.10826321691274643,
"learning_rate": 1.8306240739442094e-07,
"loss": 0.3599,
"step": 1281
},
{
"epoch": 2.7688984881209504,
"grad_norm": 0.1105961948633194,
"learning_rate": 1.7970841896164658e-07,
"loss": 0.3652,
"step": 1282
},
{
"epoch": 2.7710583153347734,
"grad_norm": 0.10997821390628815,
"learning_rate": 1.7638487811028616e-07,
"loss": 0.3675,
"step": 1283
},
{
"epoch": 2.773218142548596,
"grad_norm": 0.1074373796582222,
"learning_rate": 1.7309180583363062e-07,
"loss": 0.3542,
"step": 1284
},
{
"epoch": 2.775377969762419,
"grad_norm": 0.10459216684103012,
"learning_rate": 1.6982922293251548e-07,
"loss": 0.3538,
"step": 1285
},
{
"epoch": 2.777537796976242,
"grad_norm": 0.10451044887304306,
"learning_rate": 1.6659715001518583e-07,
"loss": 0.367,
"step": 1286
},
{
"epoch": 2.7796976241900646,
"grad_norm": 0.10947411507368088,
"learning_rate": 1.6339560749717154e-07,
"loss": 0.3515,
"step": 1287
},
{
"epoch": 2.7818574514038876,
"grad_norm": 0.11110340058803558,
"learning_rate": 1.6022461560115498e-07,
"loss": 0.3603,
"step": 1288
},
{
"epoch": 2.7840172786177106,
"grad_norm": 0.10515395551919937,
"learning_rate": 1.5708419435684463e-07,
"loss": 0.3547,
"step": 1289
},
{
"epoch": 2.786177105831533,
"grad_norm": 0.10683929920196533,
"learning_rate": 1.5397436360084784e-07,
"loss": 0.3617,
"step": 1290
},
{
"epoch": 2.7883369330453562,
"grad_norm": 0.10624652355909348,
"learning_rate": 1.5089514297654594e-07,
"loss": 0.3553,
"step": 1291
},
{
"epoch": 2.7904967602591793,
"grad_norm": 0.11002147197723389,
"learning_rate": 1.4784655193396947e-07,
"loss": 0.3557,
"step": 1292
},
{
"epoch": 2.7926565874730023,
"grad_norm": 0.1125330999493599,
"learning_rate": 1.448286097296764e-07,
"loss": 0.3544,
"step": 1293
},
{
"epoch": 2.7948164146868253,
"grad_norm": 0.11160624772310257,
"learning_rate": 1.4184133542663014e-07,
"loss": 0.3694,
"step": 1294
},
{
"epoch": 2.796976241900648,
"grad_norm": 0.10507107526063919,
"learning_rate": 1.388847478940797e-07,
"loss": 0.3713,
"step": 1295
},
{
"epoch": 2.799136069114471,
"grad_norm": 0.107913538813591,
"learning_rate": 1.3595886580743677e-07,
"loss": 0.3698,
"step": 1296
},
{
"epoch": 2.801295896328294,
"grad_norm": 0.11146403104066849,
"learning_rate": 1.330637076481639e-07,
"loss": 0.36,
"step": 1297
},
{
"epoch": 2.8034557235421165,
"grad_norm": 0.10874520242214203,
"learning_rate": 1.3019929170365376e-07,
"loss": 0.3639,
"step": 1298
},
{
"epoch": 2.8056155507559395,
"grad_norm": 0.11767850816249847,
"learning_rate": 1.2736563606711384e-07,
"loss": 0.3618,
"step": 1299
},
{
"epoch": 2.8077753779697625,
"grad_norm": 0.10746905952692032,
"learning_rate": 1.2456275863745426e-07,
"loss": 0.3624,
"step": 1300
},
{
"epoch": 2.809935205183585,
"grad_norm": 0.10965242981910706,
"learning_rate": 1.2179067711917015e-07,
"loss": 0.3732,
"step": 1301
},
{
"epoch": 2.812095032397408,
"grad_norm": 0.10720682889223099,
"learning_rate": 1.1904940902223661e-07,
"loss": 0.3661,
"step": 1302
},
{
"epoch": 2.814254859611231,
"grad_norm": 0.11190472543239594,
"learning_rate": 1.1633897166199227e-07,
"loss": 0.3572,
"step": 1303
},
{
"epoch": 2.816414686825054,
"grad_norm": 0.10630635917186737,
"learning_rate": 1.136593821590326e-07,
"loss": 0.3587,
"step": 1304
},
{
"epoch": 2.818574514038877,
"grad_norm": 0.10910697281360626,
"learning_rate": 1.1101065743910122e-07,
"loss": 0.3666,
"step": 1305
},
{
"epoch": 2.8207343412526997,
"grad_norm": 0.11752592027187347,
"learning_rate": 1.0839281423298375e-07,
"loss": 0.3638,
"step": 1306
},
{
"epoch": 2.8228941684665227,
"grad_norm": 0.11391156911849976,
"learning_rate": 1.0580586907639912e-07,
"loss": 0.3605,
"step": 1307
},
{
"epoch": 2.8250539956803458,
"grad_norm": 0.11459757387638092,
"learning_rate": 1.032498383099001e-07,
"loss": 0.365,
"step": 1308
},
{
"epoch": 2.8272138228941683,
"grad_norm": 0.10249683260917664,
"learning_rate": 1.007247380787657e-07,
"loss": 0.3609,
"step": 1309
},
{
"epoch": 2.8293736501079914,
"grad_norm": 0.11776190996170044,
"learning_rate": 9.823058433290178e-08,
"loss": 0.3667,
"step": 1310
},
{
"epoch": 2.8315334773218144,
"grad_norm": 0.10746931284666061,
"learning_rate": 9.576739282673886e-08,
"loss": 0.3598,
"step": 1311
},
{
"epoch": 2.833693304535637,
"grad_norm": 0.1106642559170723,
"learning_rate": 9.333517911913281e-08,
"loss": 0.3627,
"step": 1312
},
{
"epoch": 2.83585313174946,
"grad_norm": 0.1114298552274704,
"learning_rate": 9.093395857326714e-08,
"loss": 0.3521,
"step": 1313
},
{
"epoch": 2.838012958963283,
"grad_norm": 0.11040709167718887,
"learning_rate": 8.856374635655696e-08,
"loss": 0.3618,
"step": 1314
},
{
"epoch": 2.8401727861771056,
"grad_norm": 0.10548478364944458,
"learning_rate": 8.622455744054958e-08,
"loss": 0.3574,
"step": 1315
},
{
"epoch": 2.8423326133909286,
"grad_norm": 0.1121056005358696,
"learning_rate": 8.391640660083411e-08,
"loss": 0.3693,
"step": 1316
},
{
"epoch": 2.8444924406047516,
"grad_norm": 0.11348962038755417,
"learning_rate": 8.163930841694589e-08,
"loss": 0.3569,
"step": 1317
},
{
"epoch": 2.8466522678185746,
"grad_norm": 0.10726695507764816,
"learning_rate": 7.939327727227441e-08,
"loss": 0.3667,
"step": 1318
},
{
"epoch": 2.8488120950323976,
"grad_norm": 0.10446982830762863,
"learning_rate": 7.717832735397335e-08,
"loss": 0.3685,
"step": 1319
},
{
"epoch": 2.85097192224622,
"grad_norm": 0.11472396552562714,
"learning_rate": 7.499447265286952e-08,
"loss": 0.364,
"step": 1320
},
{
"epoch": 2.853131749460043,
"grad_norm": 0.10750308632850647,
"learning_rate": 7.284172696337688e-08,
"loss": 0.3626,
"step": 1321
},
{
"epoch": 2.8552915766738662,
"grad_norm": 0.11191460490226746,
"learning_rate": 7.072010388340656e-08,
"loss": 0.3623,
"step": 1322
},
{
"epoch": 2.857451403887689,
"grad_norm": 0.10993245989084244,
"learning_rate": 6.862961681428304e-08,
"loss": 0.3549,
"step": 1323
},
{
"epoch": 2.859611231101512,
"grad_norm": 0.11314646899700165,
"learning_rate": 6.657027896065982e-08,
"loss": 0.3542,
"step": 1324
},
{
"epoch": 2.861771058315335,
"grad_norm": 0.1285964399576187,
"learning_rate": 6.454210333043275e-08,
"loss": 0.3572,
"step": 1325
},
{
"epoch": 2.8639308855291574,
"grad_norm": 0.10818547010421753,
"learning_rate": 6.254510273466186e-08,
"loss": 0.3676,
"step": 1326
},
{
"epoch": 2.8660907127429804,
"grad_norm": 0.10412049293518066,
"learning_rate": 6.057928978748906e-08,
"loss": 0.3685,
"step": 1327
},
{
"epoch": 2.8682505399568035,
"grad_norm": 0.10978944599628448,
"learning_rate": 5.864467690605613e-08,
"loss": 0.3671,
"step": 1328
},
{
"epoch": 2.8704103671706265,
"grad_norm": 0.1174926683306694,
"learning_rate": 5.674127631043025e-08,
"loss": 0.3658,
"step": 1329
},
{
"epoch": 2.8725701943844495,
"grad_norm": 0.11143560707569122,
"learning_rate": 5.4869100023523526e-08,
"loss": 0.3624,
"step": 1330
},
{
"epoch": 2.874730021598272,
"grad_norm": 0.10805241763591766,
"learning_rate": 5.302815987101917e-08,
"loss": 0.3636,
"step": 1331
},
{
"epoch": 2.876889848812095,
"grad_norm": 0.11456768959760666,
"learning_rate": 5.121846748129544e-08,
"loss": 0.3537,
"step": 1332
},
{
"epoch": 2.879049676025918,
"grad_norm": 0.1143973246216774,
"learning_rate": 4.944003428535349e-08,
"loss": 0.361,
"step": 1333
},
{
"epoch": 2.8812095032397407,
"grad_norm": 0.11492909491062164,
"learning_rate": 4.769287151674407e-08,
"loss": 0.3529,
"step": 1334
},
{
"epoch": 2.8833693304535637,
"grad_norm": 0.11312732100486755,
"learning_rate": 4.597699021149649e-08,
"loss": 0.3604,
"step": 1335
},
{
"epoch": 2.8855291576673867,
"grad_norm": 0.11396172642707825,
"learning_rate": 4.429240120804923e-08,
"loss": 0.3601,
"step": 1336
},
{
"epoch": 2.8876889848812093,
"grad_norm": 0.10564181953668594,
"learning_rate": 4.263911514718222e-08,
"loss": 0.365,
"step": 1337
},
{
"epoch": 2.8898488120950323,
"grad_norm": 0.11512638628482819,
"learning_rate": 4.10171424719491e-08,
"loss": 0.3658,
"step": 1338
},
{
"epoch": 2.8920086393088553,
"grad_norm": 0.11602869629859924,
"learning_rate": 3.9426493427611177e-08,
"loss": 0.3611,
"step": 1339
},
{
"epoch": 2.8941684665226783,
"grad_norm": 0.11228124052286148,
"learning_rate": 3.786717806157136e-08,
"loss": 0.3615,
"step": 1340
},
{
"epoch": 2.896328293736501,
"grad_norm": 0.1036510244011879,
"learning_rate": 3.633920622331311e-08,
"loss": 0.3621,
"step": 1341
},
{
"epoch": 2.898488120950324,
"grad_norm": 0.11727307736873627,
"learning_rate": 3.4842587564337674e-08,
"loss": 0.3569,
"step": 1342
},
{
"epoch": 2.900647948164147,
"grad_norm": 0.10487374663352966,
"learning_rate": 3.337733153810141e-08,
"loss": 0.362,
"step": 1343
},
{
"epoch": 2.90280777537797,
"grad_norm": 0.10877780616283417,
"learning_rate": 3.194344739995803e-08,
"loss": 0.349,
"step": 1344
},
{
"epoch": 2.9049676025917925,
"grad_norm": 0.11223684251308441,
"learning_rate": 3.054094420709863e-08,
"loss": 0.365,
"step": 1345
},
{
"epoch": 2.9071274298056156,
"grad_norm": 0.103155717253685,
"learning_rate": 2.9169830818496226e-08,
"loss": 0.3592,
"step": 1346
},
{
"epoch": 2.9092872570194386,
"grad_norm": 0.11470197141170502,
"learning_rate": 2.783011589484741e-08,
"loss": 0.3578,
"step": 1347
},
{
"epoch": 2.911447084233261,
"grad_norm": 0.12167331576347351,
"learning_rate": 2.6521807898520214e-08,
"loss": 0.353,
"step": 1348
},
{
"epoch": 2.913606911447084,
"grad_norm": 0.1081666648387909,
"learning_rate": 2.5244915093499134e-08,
"loss": 0.3703,
"step": 1349
},
{
"epoch": 2.915766738660907,
"grad_norm": 0.11052247881889343,
"learning_rate": 2.3999445545332955e-08,
"loss": 0.3593,
"step": 1350
},
{
"epoch": 2.9179265658747298,
"grad_norm": 0.10058227181434631,
"learning_rate": 2.2785407121084236e-08,
"loss": 0.371,
"step": 1351
},
{
"epoch": 2.920086393088553,
"grad_norm": 0.11320126056671143,
"learning_rate": 2.1602807489279344e-08,
"loss": 0.3549,
"step": 1352
},
{
"epoch": 2.922246220302376,
"grad_norm": 0.11561363190412521,
"learning_rate": 2.0451654119860164e-08,
"loss": 0.3578,
"step": 1353
},
{
"epoch": 2.924406047516199,
"grad_norm": 0.10961954295635223,
"learning_rate": 1.9331954284137476e-08,
"loss": 0.3676,
"step": 1354
},
{
"epoch": 2.926565874730022,
"grad_norm": 0.10924555361270905,
"learning_rate": 1.8243715054744315e-08,
"loss": 0.3726,
"step": 1355
},
{
"epoch": 2.9287257019438444,
"grad_norm": 0.10880248993635178,
"learning_rate": 1.71869433055899e-08,
"loss": 0.3547,
"step": 1356
},
{
"epoch": 2.9308855291576674,
"grad_norm": 0.10798300057649612,
"learning_rate": 1.6161645711819664e-08,
"loss": 0.3569,
"step": 1357
},
{
"epoch": 2.9330453563714904,
"grad_norm": 0.11236506700515747,
"learning_rate": 1.5167828749770853e-08,
"loss": 0.3681,
"step": 1358
},
{
"epoch": 2.935205183585313,
"grad_norm": 0.10835976153612137,
"learning_rate": 1.4205498696930332e-08,
"loss": 0.3613,
"step": 1359
},
{
"epoch": 2.937365010799136,
"grad_norm": 0.12566576898097992,
"learning_rate": 1.3274661631899055e-08,
"loss": 0.3637,
"step": 1360
},
{
"epoch": 2.939524838012959,
"grad_norm": 0.10957465320825577,
"learning_rate": 1.2375323434348773e-08,
"loss": 0.3504,
"step": 1361
},
{
"epoch": 2.9416846652267816,
"grad_norm": 0.10701923072338104,
"learning_rate": 1.1507489784989278e-08,
"loss": 0.3607,
"step": 1362
},
{
"epoch": 2.9438444924406046,
"grad_norm": 0.10923349112272263,
"learning_rate": 1.067116616552899e-08,
"loss": 0.3706,
"step": 1363
},
{
"epoch": 2.9460043196544277,
"grad_norm": 0.11076433211565018,
"learning_rate": 9.866357858642206e-09,
"loss": 0.3746,
"step": 1364
},
{
"epoch": 2.9481641468682507,
"grad_norm": 0.10817807167768478,
"learning_rate": 9.09306994793635e-09,
"loss": 0.3648,
"step": 1365
},
{
"epoch": 2.9503239740820737,
"grad_norm": 0.10961637645959854,
"learning_rate": 8.351307317917002e-09,
"loss": 0.3571,
"step": 1366
},
{
"epoch": 2.9524838012958963,
"grad_norm": 0.10122332721948624,
"learning_rate": 7.641074653961244e-09,
"loss": 0.3681,
"step": 1367
},
{
"epoch": 2.9546436285097193,
"grad_norm": 0.10331834852695465,
"learning_rate": 6.962376442284368e-09,
"loss": 0.3566,
"step": 1368
},
{
"epoch": 2.9568034557235423,
"grad_norm": 0.10392733663320541,
"learning_rate": 6.315216969912663e-09,
"loss": 0.3422,
"step": 1369
},
{
"epoch": 2.958963282937365,
"grad_norm": 0.1083427146077156,
"learning_rate": 5.699600324657328e-09,
"loss": 0.3711,
"step": 1370
},
{
"epoch": 2.961123110151188,
"grad_norm": 0.1106184870004654,
"learning_rate": 5.115530395087276e-09,
"loss": 0.3639,
"step": 1371
},
{
"epoch": 2.963282937365011,
"grad_norm": 0.10812865942716599,
"learning_rate": 4.5630108705063684e-09,
"loss": 0.3647,
"step": 1372
},
{
"epoch": 2.9654427645788335,
"grad_norm": 0.11043433845043182,
"learning_rate": 4.042045240927883e-09,
"loss": 0.3706,
"step": 1373
},
{
"epoch": 2.9676025917926565,
"grad_norm": 0.1146334782242775,
"learning_rate": 3.5526367970539765e-09,
"loss": 0.3564,
"step": 1374
},
{
"epoch": 2.9697624190064795,
"grad_norm": 0.11209335923194885,
"learning_rate": 3.094788630254031e-09,
"loss": 0.369,
"step": 1375
},
{
"epoch": 2.971922246220302,
"grad_norm": 0.10334824025630951,
"learning_rate": 2.6685036325457826e-09,
"loss": 0.3614,
"step": 1376
},
{
"epoch": 2.974082073434125,
"grad_norm": 0.12000252306461334,
"learning_rate": 2.2737844965775578e-09,
"loss": 0.3677,
"step": 1377
},
{
"epoch": 2.976241900647948,
"grad_norm": 0.10969026386737823,
"learning_rate": 1.9106337156099553e-09,
"loss": 0.3506,
"step": 1378
},
{
"epoch": 2.978401727861771,
"grad_norm": 0.11796250939369202,
"learning_rate": 1.5790535835003006e-09,
"loss": 0.3698,
"step": 1379
},
{
"epoch": 2.980561555075594,
"grad_norm": 0.11176804453134537,
"learning_rate": 1.2790461946887712e-09,
"loss": 0.3574,
"step": 1380
},
{
"epoch": 2.9827213822894167,
"grad_norm": 0.10805931687355042,
"learning_rate": 1.0106134441850712e-09,
"loss": 0.3732,
"step": 1381
},
{
"epoch": 2.9848812095032398,
"grad_norm": 0.11747419834136963,
"learning_rate": 7.737570275573314e-10,
"loss": 0.3544,
"step": 1382
},
{
"epoch": 2.987041036717063,
"grad_norm": 0.11195072531700134,
"learning_rate": 5.684784409182298e-10,
"loss": 0.3743,
"step": 1383
},
{
"epoch": 2.9892008639308854,
"grad_norm": 0.11737102270126343,
"learning_rate": 3.9477898091944135e-10,
"loss": 0.3672,
"step": 1384
},
{
"epoch": 2.9913606911447084,
"grad_norm": 0.1034155786037445,
"learning_rate": 2.5265974474109054e-10,
"loss": 0.3586,
"step": 1385
},
{
"epoch": 2.9935205183585314,
"grad_norm": 0.11616694182157516,
"learning_rate": 1.4212163008509028e-10,
"loss": 0.36,
"step": 1386
},
{
"epoch": 2.995680345572354,
"grad_norm": 0.11564143747091293,
"learning_rate": 6.316533517125578e-11,
"loss": 0.3624,
"step": 1387
},
{
"epoch": 2.997840172786177,
"grad_norm": 0.11294636130332947,
"learning_rate": 1.57913587295333e-11,
"loss": 0.3607,
"step": 1388
},
{
"epoch": 3.0,
"grad_norm": 0.10668095201253891,
"learning_rate": 0.0,
"loss": 0.3576,
"step": 1389
},
{
"epoch": 3.0,
"step": 1389,
"total_flos": 2.853679693771571e+16,
"train_loss": 0.023199697249737295,
"train_runtime": 5911.0235,
"train_samples_per_second": 90.117,
"train_steps_per_second": 0.235
}
],
"logging_steps": 1,
"max_steps": 1389,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.853679693771571e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}