YouShouChat_PiPi_0.5.1.dev / trainer_state.json
DiDisama's picture
Upload 20 files
c507996 verified
raw
history blame
341 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 19560,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002556237218813906,
"grad_norm": 86.6442642211914,
"learning_rate": 4.601226993865031e-08,
"loss": 4.7575,
"step": 10
},
{
"epoch": 0.005112474437627812,
"grad_norm": 90.80802154541016,
"learning_rate": 9.713701431492844e-08,
"loss": 5.2498,
"step": 20
},
{
"epoch": 0.007668711656441718,
"grad_norm": 79.19834899902344,
"learning_rate": 1.4826175869120655e-07,
"loss": 4.8172,
"step": 30
},
{
"epoch": 0.010224948875255624,
"grad_norm": 93.67054748535156,
"learning_rate": 1.9938650306748468e-07,
"loss": 4.7357,
"step": 40
},
{
"epoch": 0.01278118609406953,
"grad_norm": 84.51275634765625,
"learning_rate": 2.505112474437628e-07,
"loss": 4.5906,
"step": 50
},
{
"epoch": 0.015337423312883436,
"grad_norm": 160.59327697753906,
"learning_rate": 3.0163599182004093e-07,
"loss": 4.4981,
"step": 60
},
{
"epoch": 0.01789366053169734,
"grad_norm": 54.63095474243164,
"learning_rate": 3.52760736196319e-07,
"loss": 3.8254,
"step": 70
},
{
"epoch": 0.02044989775051125,
"grad_norm": 56.51041793823242,
"learning_rate": 4.038854805725972e-07,
"loss": 3.2408,
"step": 80
},
{
"epoch": 0.023006134969325152,
"grad_norm": 54.57204818725586,
"learning_rate": 4.5501022494887533e-07,
"loss": 3.1371,
"step": 90
},
{
"epoch": 0.02556237218813906,
"grad_norm": 17.450857162475586,
"learning_rate": 5.061349693251534e-07,
"loss": 3.0649,
"step": 100
},
{
"epoch": 0.028118609406952964,
"grad_norm": 14.947675704956055,
"learning_rate": 5.572597137014316e-07,
"loss": 2.6627,
"step": 110
},
{
"epoch": 0.03067484662576687,
"grad_norm": 21.680395126342773,
"learning_rate": 6.083844580777097e-07,
"loss": 2.6963,
"step": 120
},
{
"epoch": 0.033231083844580775,
"grad_norm": 14.042488098144531,
"learning_rate": 6.595092024539878e-07,
"loss": 2.7294,
"step": 130
},
{
"epoch": 0.03578732106339468,
"grad_norm": 15.667428016662598,
"learning_rate": 7.106339468302658e-07,
"loss": 2.688,
"step": 140
},
{
"epoch": 0.03834355828220859,
"grad_norm": 15.774909973144531,
"learning_rate": 7.61758691206544e-07,
"loss": 2.5642,
"step": 150
},
{
"epoch": 0.0408997955010225,
"grad_norm": 17.561765670776367,
"learning_rate": 8.128834355828222e-07,
"loss": 2.621,
"step": 160
},
{
"epoch": 0.0434560327198364,
"grad_norm": 22.855037689208984,
"learning_rate": 8.640081799591003e-07,
"loss": 2.5726,
"step": 170
},
{
"epoch": 0.046012269938650305,
"grad_norm": 21.334442138671875,
"learning_rate": 9.151329243353784e-07,
"loss": 2.5354,
"step": 180
},
{
"epoch": 0.04856850715746421,
"grad_norm": 16.87196159362793,
"learning_rate": 9.662576687116565e-07,
"loss": 2.5895,
"step": 190
},
{
"epoch": 0.05112474437627812,
"grad_norm": 13.189834594726562,
"learning_rate": 1.0173824130879346e-06,
"loss": 2.5198,
"step": 200
},
{
"epoch": 0.05368098159509203,
"grad_norm": 32.269676208496094,
"learning_rate": 1.0685071574642128e-06,
"loss": 2.4075,
"step": 210
},
{
"epoch": 0.05623721881390593,
"grad_norm": 35.43799591064453,
"learning_rate": 1.119631901840491e-06,
"loss": 2.3543,
"step": 220
},
{
"epoch": 0.058793456032719835,
"grad_norm": 15.105649948120117,
"learning_rate": 1.170756646216769e-06,
"loss": 2.4616,
"step": 230
},
{
"epoch": 0.06134969325153374,
"grad_norm": 19.825071334838867,
"learning_rate": 1.221881390593047e-06,
"loss": 2.3646,
"step": 240
},
{
"epoch": 0.06390593047034765,
"grad_norm": 14.074588775634766,
"learning_rate": 1.2730061349693252e-06,
"loss": 2.1941,
"step": 250
},
{
"epoch": 0.06646216768916155,
"grad_norm": 21.8532772064209,
"learning_rate": 1.3241308793456035e-06,
"loss": 2.2606,
"step": 260
},
{
"epoch": 0.06901840490797546,
"grad_norm": 22.797710418701172,
"learning_rate": 1.3752556237218813e-06,
"loss": 2.3052,
"step": 270
},
{
"epoch": 0.07157464212678936,
"grad_norm": 19.603389739990234,
"learning_rate": 1.4263803680981596e-06,
"loss": 2.1969,
"step": 280
},
{
"epoch": 0.07413087934560328,
"grad_norm": 17.005781173706055,
"learning_rate": 1.4775051124744377e-06,
"loss": 2.3629,
"step": 290
},
{
"epoch": 0.07668711656441718,
"grad_norm": 15.876945495605469,
"learning_rate": 1.5286298568507158e-06,
"loss": 2.5044,
"step": 300
},
{
"epoch": 0.07924335378323108,
"grad_norm": 17.874849319458008,
"learning_rate": 1.579754601226994e-06,
"loss": 2.145,
"step": 310
},
{
"epoch": 0.081799591002045,
"grad_norm": 22.35564422607422,
"learning_rate": 1.630879345603272e-06,
"loss": 2.1845,
"step": 320
},
{
"epoch": 0.0843558282208589,
"grad_norm": 16.786012649536133,
"learning_rate": 1.6820040899795503e-06,
"loss": 2.1084,
"step": 330
},
{
"epoch": 0.0869120654396728,
"grad_norm": 28.834739685058594,
"learning_rate": 1.7331288343558283e-06,
"loss": 2.2875,
"step": 340
},
{
"epoch": 0.08946830265848671,
"grad_norm": 14.567386627197266,
"learning_rate": 1.7842535787321064e-06,
"loss": 2.3375,
"step": 350
},
{
"epoch": 0.09202453987730061,
"grad_norm": 31.9110164642334,
"learning_rate": 1.8353783231083847e-06,
"loss": 2.1919,
"step": 360
},
{
"epoch": 0.09458077709611452,
"grad_norm": 14.252608299255371,
"learning_rate": 1.8865030674846626e-06,
"loss": 1.9781,
"step": 370
},
{
"epoch": 0.09713701431492842,
"grad_norm": 24.873756408691406,
"learning_rate": 1.937627811860941e-06,
"loss": 2.3583,
"step": 380
},
{
"epoch": 0.09969325153374232,
"grad_norm": 28.036256790161133,
"learning_rate": 1.988752556237219e-06,
"loss": 2.136,
"step": 390
},
{
"epoch": 0.10224948875255624,
"grad_norm": 31.46503257751465,
"learning_rate": 2.039877300613497e-06,
"loss": 2.206,
"step": 400
},
{
"epoch": 0.10480572597137014,
"grad_norm": 26.446863174438477,
"learning_rate": 2.091002044989775e-06,
"loss": 2.1021,
"step": 410
},
{
"epoch": 0.10736196319018405,
"grad_norm": 14.209206581115723,
"learning_rate": 2.142126789366053e-06,
"loss": 2.058,
"step": 420
},
{
"epoch": 0.10991820040899795,
"grad_norm": 16.250686645507812,
"learning_rate": 2.1932515337423317e-06,
"loss": 2.1551,
"step": 430
},
{
"epoch": 0.11247443762781185,
"grad_norm": 14.691299438476562,
"learning_rate": 2.24437627811861e-06,
"loss": 1.9728,
"step": 440
},
{
"epoch": 0.11503067484662577,
"grad_norm": 15.899717330932617,
"learning_rate": 2.2955010224948875e-06,
"loss": 1.9192,
"step": 450
},
{
"epoch": 0.11758691206543967,
"grad_norm": 13.063642501831055,
"learning_rate": 2.346625766871166e-06,
"loss": 2.1218,
"step": 460
},
{
"epoch": 0.12014314928425358,
"grad_norm": 13.922229766845703,
"learning_rate": 2.397750511247444e-06,
"loss": 1.8393,
"step": 470
},
{
"epoch": 0.12269938650306748,
"grad_norm": 20.130887985229492,
"learning_rate": 2.448875255623722e-06,
"loss": 2.0518,
"step": 480
},
{
"epoch": 0.1252556237218814,
"grad_norm": 20.779491424560547,
"learning_rate": 2.5e-06,
"loss": 2.0944,
"step": 490
},
{
"epoch": 0.1278118609406953,
"grad_norm": 16.71235466003418,
"learning_rate": 2.5511247443762783e-06,
"loss": 1.9393,
"step": 500
},
{
"epoch": 0.1303680981595092,
"grad_norm": 29.568498611450195,
"learning_rate": 2.6022494887525564e-06,
"loss": 2.05,
"step": 510
},
{
"epoch": 0.1329243353783231,
"grad_norm": 13.172797203063965,
"learning_rate": 2.653374233128835e-06,
"loss": 2.1483,
"step": 520
},
{
"epoch": 0.13548057259713703,
"grad_norm": 19.142757415771484,
"learning_rate": 2.704498977505113e-06,
"loss": 1.804,
"step": 530
},
{
"epoch": 0.13803680981595093,
"grad_norm": 11.22604751586914,
"learning_rate": 2.7556237218813906e-06,
"loss": 1.8299,
"step": 540
},
{
"epoch": 0.14059304703476483,
"grad_norm": 18.316205978393555,
"learning_rate": 2.8067484662576687e-06,
"loss": 2.0066,
"step": 550
},
{
"epoch": 0.14314928425357873,
"grad_norm": 31.03098487854004,
"learning_rate": 2.8578732106339468e-06,
"loss": 2.1166,
"step": 560
},
{
"epoch": 0.14570552147239263,
"grad_norm": 13.343217849731445,
"learning_rate": 2.9089979550102253e-06,
"loss": 1.9187,
"step": 570
},
{
"epoch": 0.14826175869120656,
"grad_norm": 16.278240203857422,
"learning_rate": 2.9601226993865034e-06,
"loss": 1.8516,
"step": 580
},
{
"epoch": 0.15081799591002046,
"grad_norm": 14.405373573303223,
"learning_rate": 3.0112474437627814e-06,
"loss": 1.8636,
"step": 590
},
{
"epoch": 0.15337423312883436,
"grad_norm": 28.38698387145996,
"learning_rate": 3.0623721881390595e-06,
"loss": 1.897,
"step": 600
},
{
"epoch": 0.15593047034764826,
"grad_norm": 15.13376522064209,
"learning_rate": 3.1134969325153376e-06,
"loss": 1.7026,
"step": 610
},
{
"epoch": 0.15848670756646216,
"grad_norm": 21.627859115600586,
"learning_rate": 3.164621676891616e-06,
"loss": 1.7341,
"step": 620
},
{
"epoch": 0.16104294478527606,
"grad_norm": 34.46842956542969,
"learning_rate": 3.215746421267894e-06,
"loss": 1.668,
"step": 630
},
{
"epoch": 0.16359918200409,
"grad_norm": 15.768085479736328,
"learning_rate": 3.266871165644172e-06,
"loss": 1.5321,
"step": 640
},
{
"epoch": 0.1661554192229039,
"grad_norm": 15.477701187133789,
"learning_rate": 3.31799591002045e-06,
"loss": 1.7124,
"step": 650
},
{
"epoch": 0.1687116564417178,
"grad_norm": 14.961771011352539,
"learning_rate": 3.369120654396728e-06,
"loss": 1.7718,
"step": 660
},
{
"epoch": 0.1712678936605317,
"grad_norm": 16.937185287475586,
"learning_rate": 3.4202453987730065e-06,
"loss": 1.7621,
"step": 670
},
{
"epoch": 0.1738241308793456,
"grad_norm": 13.276283264160156,
"learning_rate": 3.4713701431492846e-06,
"loss": 1.8156,
"step": 680
},
{
"epoch": 0.17638036809815952,
"grad_norm": 20.783721923828125,
"learning_rate": 3.5224948875255627e-06,
"loss": 1.8182,
"step": 690
},
{
"epoch": 0.17893660531697342,
"grad_norm": 15.780200004577637,
"learning_rate": 3.5736196319018408e-06,
"loss": 1.7411,
"step": 700
},
{
"epoch": 0.18149284253578732,
"grad_norm": 14.318155288696289,
"learning_rate": 3.624744376278119e-06,
"loss": 1.8282,
"step": 710
},
{
"epoch": 0.18404907975460122,
"grad_norm": 17.099267959594727,
"learning_rate": 3.6758691206543974e-06,
"loss": 1.8697,
"step": 720
},
{
"epoch": 0.18660531697341512,
"grad_norm": 24.751697540283203,
"learning_rate": 3.7269938650306754e-06,
"loss": 1.585,
"step": 730
},
{
"epoch": 0.18916155419222905,
"grad_norm": 14.778061866760254,
"learning_rate": 3.778118609406953e-06,
"loss": 1.5203,
"step": 740
},
{
"epoch": 0.19171779141104295,
"grad_norm": 11.545260429382324,
"learning_rate": 3.829243353783232e-06,
"loss": 1.85,
"step": 750
},
{
"epoch": 0.19427402862985685,
"grad_norm": 15.764128684997559,
"learning_rate": 3.880368098159509e-06,
"loss": 1.7218,
"step": 760
},
{
"epoch": 0.19683026584867075,
"grad_norm": 19.65780258178711,
"learning_rate": 3.931492842535788e-06,
"loss": 1.7144,
"step": 770
},
{
"epoch": 0.19938650306748465,
"grad_norm": 11.911704063415527,
"learning_rate": 3.982617586912066e-06,
"loss": 1.4344,
"step": 780
},
{
"epoch": 0.20194274028629858,
"grad_norm": 28.547578811645508,
"learning_rate": 4.033742331288344e-06,
"loss": 1.6595,
"step": 790
},
{
"epoch": 0.20449897750511248,
"grad_norm": 21.17897605895996,
"learning_rate": 4.084867075664622e-06,
"loss": 1.5372,
"step": 800
},
{
"epoch": 0.20705521472392638,
"grad_norm": 32.14210510253906,
"learning_rate": 4.1359918200409e-06,
"loss": 1.7575,
"step": 810
},
{
"epoch": 0.20961145194274028,
"grad_norm": 16.999343872070312,
"learning_rate": 4.187116564417179e-06,
"loss": 1.4743,
"step": 820
},
{
"epoch": 0.21216768916155418,
"grad_norm": 16.725637435913086,
"learning_rate": 4.238241308793456e-06,
"loss": 1.7326,
"step": 830
},
{
"epoch": 0.2147239263803681,
"grad_norm": 17.674165725708008,
"learning_rate": 4.289366053169735e-06,
"loss": 1.7185,
"step": 840
},
{
"epoch": 0.217280163599182,
"grad_norm": 14.320542335510254,
"learning_rate": 4.3404907975460124e-06,
"loss": 1.7974,
"step": 850
},
{
"epoch": 0.2198364008179959,
"grad_norm": 15.723745346069336,
"learning_rate": 4.391615541922291e-06,
"loss": 1.475,
"step": 860
},
{
"epoch": 0.2223926380368098,
"grad_norm": 17.50130844116211,
"learning_rate": 4.4427402862985694e-06,
"loss": 1.5188,
"step": 870
},
{
"epoch": 0.2249488752556237,
"grad_norm": 20.599023818969727,
"learning_rate": 4.493865030674847e-06,
"loss": 1.3638,
"step": 880
},
{
"epoch": 0.22750511247443764,
"grad_norm": 20.347301483154297,
"learning_rate": 4.544989775051125e-06,
"loss": 1.6373,
"step": 890
},
{
"epoch": 0.23006134969325154,
"grad_norm": 16.79341697692871,
"learning_rate": 4.596114519427403e-06,
"loss": 1.6475,
"step": 900
},
{
"epoch": 0.23261758691206544,
"grad_norm": 38.66135787963867,
"learning_rate": 4.647239263803681e-06,
"loss": 1.4448,
"step": 910
},
{
"epoch": 0.23517382413087934,
"grad_norm": 39.84195327758789,
"learning_rate": 4.6983640081799594e-06,
"loss": 1.5184,
"step": 920
},
{
"epoch": 0.23773006134969324,
"grad_norm": 13.964972496032715,
"learning_rate": 4.749488752556238e-06,
"loss": 1.506,
"step": 930
},
{
"epoch": 0.24028629856850717,
"grad_norm": 41.46586608886719,
"learning_rate": 4.800613496932516e-06,
"loss": 1.552,
"step": 940
},
{
"epoch": 0.24284253578732107,
"grad_norm": 14.640779495239258,
"learning_rate": 4.851738241308794e-06,
"loss": 1.3879,
"step": 950
},
{
"epoch": 0.24539877300613497,
"grad_norm": 15.459932327270508,
"learning_rate": 4.902862985685072e-06,
"loss": 1.3674,
"step": 960
},
{
"epoch": 0.24795501022494887,
"grad_norm": 15.347355842590332,
"learning_rate": 4.95398773006135e-06,
"loss": 1.1746,
"step": 970
},
{
"epoch": 0.2505112474437628,
"grad_norm": 28.36712074279785,
"learning_rate": 5.005112474437628e-06,
"loss": 1.5726,
"step": 980
},
{
"epoch": 0.25306748466257667,
"grad_norm": 19.360950469970703,
"learning_rate": 5.0562372188139064e-06,
"loss": 1.6336,
"step": 990
},
{
"epoch": 0.2556237218813906,
"grad_norm": 14.269163131713867,
"learning_rate": 5.107361963190185e-06,
"loss": 1.4811,
"step": 1000
},
{
"epoch": 0.2581799591002045,
"grad_norm": 13.799911499023438,
"learning_rate": 5.158486707566463e-06,
"loss": 1.5457,
"step": 1010
},
{
"epoch": 0.2607361963190184,
"grad_norm": 14.226543426513672,
"learning_rate": 5.209611451942741e-06,
"loss": 1.292,
"step": 1020
},
{
"epoch": 0.2632924335378323,
"grad_norm": 17.43229103088379,
"learning_rate": 5.260736196319019e-06,
"loss": 1.4887,
"step": 1030
},
{
"epoch": 0.2658486707566462,
"grad_norm": 14.904557228088379,
"learning_rate": 5.311860940695297e-06,
"loss": 1.4825,
"step": 1040
},
{
"epoch": 0.2684049079754601,
"grad_norm": 14.981825828552246,
"learning_rate": 5.362985685071576e-06,
"loss": 1.3343,
"step": 1050
},
{
"epoch": 0.27096114519427406,
"grad_norm": 17.142181396484375,
"learning_rate": 5.4141104294478534e-06,
"loss": 1.5909,
"step": 1060
},
{
"epoch": 0.27351738241308793,
"grad_norm": 25.113466262817383,
"learning_rate": 5.465235173824132e-06,
"loss": 1.4104,
"step": 1070
},
{
"epoch": 0.27607361963190186,
"grad_norm": 15.018096923828125,
"learning_rate": 5.516359918200409e-06,
"loss": 1.6476,
"step": 1080
},
{
"epoch": 0.27862985685071573,
"grad_norm": 20.576635360717773,
"learning_rate": 5.567484662576687e-06,
"loss": 1.402,
"step": 1090
},
{
"epoch": 0.28118609406952966,
"grad_norm": 22.463077545166016,
"learning_rate": 5.618609406952967e-06,
"loss": 1.4635,
"step": 1100
},
{
"epoch": 0.2837423312883436,
"grad_norm": 24.64898109436035,
"learning_rate": 5.669734151329243e-06,
"loss": 1.4447,
"step": 1110
},
{
"epoch": 0.28629856850715746,
"grad_norm": 13.527003288269043,
"learning_rate": 5.720858895705522e-06,
"loss": 1.3098,
"step": 1120
},
{
"epoch": 0.2888548057259714,
"grad_norm": 28.426868438720703,
"learning_rate": 5.7719836400817996e-06,
"loss": 1.2572,
"step": 1130
},
{
"epoch": 0.29141104294478526,
"grad_norm": 17.067176818847656,
"learning_rate": 5.823108384458078e-06,
"loss": 1.1472,
"step": 1140
},
{
"epoch": 0.2939672801635992,
"grad_norm": 13.91565990447998,
"learning_rate": 5.874233128834357e-06,
"loss": 1.4355,
"step": 1150
},
{
"epoch": 0.2965235173824131,
"grad_norm": 19.00754165649414,
"learning_rate": 5.925357873210634e-06,
"loss": 1.4044,
"step": 1160
},
{
"epoch": 0.299079754601227,
"grad_norm": 23.37032127380371,
"learning_rate": 5.976482617586913e-06,
"loss": 1.0234,
"step": 1170
},
{
"epoch": 0.3016359918200409,
"grad_norm": 16.511402130126953,
"learning_rate": 6.02760736196319e-06,
"loss": 1.3716,
"step": 1180
},
{
"epoch": 0.3041922290388548,
"grad_norm": 27.51507568359375,
"learning_rate": 6.078732106339469e-06,
"loss": 1.19,
"step": 1190
},
{
"epoch": 0.3067484662576687,
"grad_norm": 9.414546012878418,
"learning_rate": 6.129856850715747e-06,
"loss": 1.1535,
"step": 1200
},
{
"epoch": 0.30930470347648265,
"grad_norm": 11.353070259094238,
"learning_rate": 6.180981595092025e-06,
"loss": 1.2848,
"step": 1210
},
{
"epoch": 0.3118609406952965,
"grad_norm": 13.55284595489502,
"learning_rate": 6.232106339468304e-06,
"loss": 1.1883,
"step": 1220
},
{
"epoch": 0.31441717791411045,
"grad_norm": 22.225780487060547,
"learning_rate": 6.283231083844581e-06,
"loss": 1.5084,
"step": 1230
},
{
"epoch": 0.3169734151329243,
"grad_norm": 31.59255027770996,
"learning_rate": 6.33435582822086e-06,
"loss": 1.4056,
"step": 1240
},
{
"epoch": 0.31952965235173825,
"grad_norm": 37.905452728271484,
"learning_rate": 6.385480572597138e-06,
"loss": 1.5661,
"step": 1250
},
{
"epoch": 0.3220858895705521,
"grad_norm": 18.418148040771484,
"learning_rate": 6.436605316973416e-06,
"loss": 1.4067,
"step": 1260
},
{
"epoch": 0.32464212678936605,
"grad_norm": 14.485339164733887,
"learning_rate": 6.487730061349694e-06,
"loss": 1.0165,
"step": 1270
},
{
"epoch": 0.32719836400818,
"grad_norm": 30.809120178222656,
"learning_rate": 6.538854805725971e-06,
"loss": 1.2704,
"step": 1280
},
{
"epoch": 0.32975460122699385,
"grad_norm": 19.595361709594727,
"learning_rate": 6.58997955010225e-06,
"loss": 1.2991,
"step": 1290
},
{
"epoch": 0.3323108384458078,
"grad_norm": 10.220748901367188,
"learning_rate": 6.641104294478529e-06,
"loss": 1.1867,
"step": 1300
},
{
"epoch": 0.33486707566462165,
"grad_norm": 12.308537483215332,
"learning_rate": 6.692229038854806e-06,
"loss": 1.2807,
"step": 1310
},
{
"epoch": 0.3374233128834356,
"grad_norm": 21.010303497314453,
"learning_rate": 6.743353783231084e-06,
"loss": 1.1327,
"step": 1320
},
{
"epoch": 0.3399795501022495,
"grad_norm": 13.279821395874023,
"learning_rate": 6.794478527607362e-06,
"loss": 1.113,
"step": 1330
},
{
"epoch": 0.3425357873210634,
"grad_norm": 16.616683959960938,
"learning_rate": 6.8456032719836406e-06,
"loss": 1.2778,
"step": 1340
},
{
"epoch": 0.3450920245398773,
"grad_norm": 14.577863693237305,
"learning_rate": 6.896728016359919e-06,
"loss": 1.1638,
"step": 1350
},
{
"epoch": 0.3476482617586912,
"grad_norm": 19.595500946044922,
"learning_rate": 6.947852760736197e-06,
"loss": 1.374,
"step": 1360
},
{
"epoch": 0.3502044989775051,
"grad_norm": 19.486886978149414,
"learning_rate": 6.998977505112475e-06,
"loss": 1.2494,
"step": 1370
},
{
"epoch": 0.35276073619631904,
"grad_norm": 15.864728927612305,
"learning_rate": 7.050102249488753e-06,
"loss": 1.2167,
"step": 1380
},
{
"epoch": 0.3553169734151329,
"grad_norm": 11.7051362991333,
"learning_rate": 7.101226993865031e-06,
"loss": 1.2047,
"step": 1390
},
{
"epoch": 0.35787321063394684,
"grad_norm": 27.072895050048828,
"learning_rate": 7.15235173824131e-06,
"loss": 1.1739,
"step": 1400
},
{
"epoch": 0.3604294478527607,
"grad_norm": 13.395477294921875,
"learning_rate": 7.2034764826175876e-06,
"loss": 1.4076,
"step": 1410
},
{
"epoch": 0.36298568507157464,
"grad_norm": 11.141236305236816,
"learning_rate": 7.254601226993866e-06,
"loss": 1.2486,
"step": 1420
},
{
"epoch": 0.36554192229038857,
"grad_norm": 29.229612350463867,
"learning_rate": 7.305725971370144e-06,
"loss": 1.2913,
"step": 1430
},
{
"epoch": 0.36809815950920244,
"grad_norm": 13.788121223449707,
"learning_rate": 7.356850715746422e-06,
"loss": 1.2888,
"step": 1440
},
{
"epoch": 0.37065439672801637,
"grad_norm": 22.21321678161621,
"learning_rate": 7.407975460122701e-06,
"loss": 1.2711,
"step": 1450
},
{
"epoch": 0.37321063394683024,
"grad_norm": 15.443243980407715,
"learning_rate": 7.459100204498978e-06,
"loss": 1.0954,
"step": 1460
},
{
"epoch": 0.37576687116564417,
"grad_norm": 16.390304565429688,
"learning_rate": 7.510224948875257e-06,
"loss": 1.267,
"step": 1470
},
{
"epoch": 0.3783231083844581,
"grad_norm": 25.774921417236328,
"learning_rate": 7.561349693251534e-06,
"loss": 1.4799,
"step": 1480
},
{
"epoch": 0.38087934560327197,
"grad_norm": 12.54340648651123,
"learning_rate": 7.612474437627812e-06,
"loss": 1.3772,
"step": 1490
},
{
"epoch": 0.3834355828220859,
"grad_norm": 22.544086456298828,
"learning_rate": 7.663599182004092e-06,
"loss": 1.0647,
"step": 1500
},
{
"epoch": 0.38599182004089977,
"grad_norm": 18.576513290405273,
"learning_rate": 7.714723926380368e-06,
"loss": 1.0959,
"step": 1510
},
{
"epoch": 0.3885480572597137,
"grad_norm": 29.345508575439453,
"learning_rate": 7.765848670756647e-06,
"loss": 0.7637,
"step": 1520
},
{
"epoch": 0.3911042944785276,
"grad_norm": 17.49864387512207,
"learning_rate": 7.816973415132925e-06,
"loss": 1.0717,
"step": 1530
},
{
"epoch": 0.3936605316973415,
"grad_norm": 16.895015716552734,
"learning_rate": 7.868098159509204e-06,
"loss": 1.1233,
"step": 1540
},
{
"epoch": 0.3962167689161554,
"grad_norm": 13.488862991333008,
"learning_rate": 7.919222903885482e-06,
"loss": 1.2759,
"step": 1550
},
{
"epoch": 0.3987730061349693,
"grad_norm": 32.994239807128906,
"learning_rate": 7.97034764826176e-06,
"loss": 1.3437,
"step": 1560
},
{
"epoch": 0.4013292433537832,
"grad_norm": 16.069793701171875,
"learning_rate": 8.021472392638038e-06,
"loss": 1.2101,
"step": 1570
},
{
"epoch": 0.40388548057259716,
"grad_norm": 12.347966194152832,
"learning_rate": 8.072597137014315e-06,
"loss": 0.8976,
"step": 1580
},
{
"epoch": 0.40644171779141103,
"grad_norm": 19.300352096557617,
"learning_rate": 8.123721881390593e-06,
"loss": 1.0809,
"step": 1590
},
{
"epoch": 0.40899795501022496,
"grad_norm": 18.28475570678711,
"learning_rate": 8.174846625766872e-06,
"loss": 1.1435,
"step": 1600
},
{
"epoch": 0.41155419222903883,
"grad_norm": 32.143680572509766,
"learning_rate": 8.22597137014315e-06,
"loss": 1.1703,
"step": 1610
},
{
"epoch": 0.41411042944785276,
"grad_norm": 16.422698974609375,
"learning_rate": 8.277096114519429e-06,
"loss": 0.8745,
"step": 1620
},
{
"epoch": 0.4166666666666667,
"grad_norm": 29.04837989807129,
"learning_rate": 8.328220858895705e-06,
"loss": 0.929,
"step": 1630
},
{
"epoch": 0.41922290388548056,
"grad_norm": 18.266582489013672,
"learning_rate": 8.379345603271984e-06,
"loss": 1.0091,
"step": 1640
},
{
"epoch": 0.4217791411042945,
"grad_norm": 15.355749130249023,
"learning_rate": 8.430470347648262e-06,
"loss": 0.9601,
"step": 1650
},
{
"epoch": 0.42433537832310836,
"grad_norm": 11.973981857299805,
"learning_rate": 8.481595092024541e-06,
"loss": 1.0835,
"step": 1660
},
{
"epoch": 0.4268916155419223,
"grad_norm": 17.572921752929688,
"learning_rate": 8.53271983640082e-06,
"loss": 1.0341,
"step": 1670
},
{
"epoch": 0.4294478527607362,
"grad_norm": 13.629963874816895,
"learning_rate": 8.583844580777096e-06,
"loss": 1.1951,
"step": 1680
},
{
"epoch": 0.4320040899795501,
"grad_norm": 10.527235984802246,
"learning_rate": 8.634969325153375e-06,
"loss": 1.0702,
"step": 1690
},
{
"epoch": 0.434560327198364,
"grad_norm": 17.04031753540039,
"learning_rate": 8.686094069529653e-06,
"loss": 1.2097,
"step": 1700
},
{
"epoch": 0.4371165644171779,
"grad_norm": 11.430649757385254,
"learning_rate": 8.737218813905932e-06,
"loss": 0.7584,
"step": 1710
},
{
"epoch": 0.4396728016359918,
"grad_norm": 10.45757007598877,
"learning_rate": 8.78834355828221e-06,
"loss": 1.1054,
"step": 1720
},
{
"epoch": 0.44222903885480574,
"grad_norm": 17.184608459472656,
"learning_rate": 8.839468302658487e-06,
"loss": 0.8942,
"step": 1730
},
{
"epoch": 0.4447852760736196,
"grad_norm": 11.653769493103027,
"learning_rate": 8.890593047034766e-06,
"loss": 1.2842,
"step": 1740
},
{
"epoch": 0.44734151329243355,
"grad_norm": 17.205242156982422,
"learning_rate": 8.941717791411042e-06,
"loss": 0.9786,
"step": 1750
},
{
"epoch": 0.4498977505112474,
"grad_norm": 27.1918888092041,
"learning_rate": 8.992842535787321e-06,
"loss": 1.2404,
"step": 1760
},
{
"epoch": 0.45245398773006135,
"grad_norm": 14.006787300109863,
"learning_rate": 9.043967280163601e-06,
"loss": 1.0146,
"step": 1770
},
{
"epoch": 0.4550102249488753,
"grad_norm": 21.269569396972656,
"learning_rate": 9.095092024539878e-06,
"loss": 0.9494,
"step": 1780
},
{
"epoch": 0.45756646216768915,
"grad_norm": 13.8292236328125,
"learning_rate": 9.146216768916156e-06,
"loss": 1.2056,
"step": 1790
},
{
"epoch": 0.4601226993865031,
"grad_norm": 11.28924560546875,
"learning_rate": 9.197341513292433e-06,
"loss": 1.1219,
"step": 1800
},
{
"epoch": 0.46267893660531695,
"grad_norm": 24.358989715576172,
"learning_rate": 9.248466257668712e-06,
"loss": 1.0259,
"step": 1810
},
{
"epoch": 0.4652351738241309,
"grad_norm": 16.623613357543945,
"learning_rate": 9.29959100204499e-06,
"loss": 0.8764,
"step": 1820
},
{
"epoch": 0.4677914110429448,
"grad_norm": 11.915813446044922,
"learning_rate": 9.350715746421269e-06,
"loss": 0.9114,
"step": 1830
},
{
"epoch": 0.4703476482617587,
"grad_norm": 14.000443458557129,
"learning_rate": 9.401840490797547e-06,
"loss": 0.9903,
"step": 1840
},
{
"epoch": 0.4729038854805726,
"grad_norm": 9.23658561706543,
"learning_rate": 9.452965235173824e-06,
"loss": 1.1503,
"step": 1850
},
{
"epoch": 0.4754601226993865,
"grad_norm": 14.627740859985352,
"learning_rate": 9.504089979550103e-06,
"loss": 1.0605,
"step": 1860
},
{
"epoch": 0.4780163599182004,
"grad_norm": 13.077226638793945,
"learning_rate": 9.555214723926381e-06,
"loss": 0.9759,
"step": 1870
},
{
"epoch": 0.48057259713701433,
"grad_norm": 9.975872993469238,
"learning_rate": 9.60633946830266e-06,
"loss": 0.9908,
"step": 1880
},
{
"epoch": 0.4831288343558282,
"grad_norm": 15.750456809997559,
"learning_rate": 9.657464212678938e-06,
"loss": 1.0758,
"step": 1890
},
{
"epoch": 0.48568507157464214,
"grad_norm": 10.907366752624512,
"learning_rate": 9.708588957055215e-06,
"loss": 0.8757,
"step": 1900
},
{
"epoch": 0.488241308793456,
"grad_norm": 26.87792205810547,
"learning_rate": 9.759713701431493e-06,
"loss": 0.9745,
"step": 1910
},
{
"epoch": 0.49079754601226994,
"grad_norm": 10.880130767822266,
"learning_rate": 9.810838445807772e-06,
"loss": 0.9391,
"step": 1920
},
{
"epoch": 0.49335378323108386,
"grad_norm": 19.826669692993164,
"learning_rate": 9.86196319018405e-06,
"loss": 1.1917,
"step": 1930
},
{
"epoch": 0.49591002044989774,
"grad_norm": 11.035025596618652,
"learning_rate": 9.913087934560329e-06,
"loss": 0.7836,
"step": 1940
},
{
"epoch": 0.49846625766871167,
"grad_norm": 13.407333374023438,
"learning_rate": 9.964212678936606e-06,
"loss": 1.1624,
"step": 1950
},
{
"epoch": 0.5010224948875256,
"grad_norm": 18.5594482421875,
"learning_rate": 9.999999283428496e-06,
"loss": 1.0359,
"step": 1960
},
{
"epoch": 0.5035787321063395,
"grad_norm": 30.378826141357422,
"learning_rate": 9.999986544385255e-06,
"loss": 0.9342,
"step": 1970
},
{
"epoch": 0.5061349693251533,
"grad_norm": 26.27793312072754,
"learning_rate": 9.99995788157752e-06,
"loss": 0.7684,
"step": 1980
},
{
"epoch": 0.5086912065439673,
"grad_norm": 17.525869369506836,
"learning_rate": 9.999913295096573e-06,
"loss": 1.2072,
"step": 1990
},
{
"epoch": 0.5112474437627812,
"grad_norm": 19.318090438842773,
"learning_rate": 9.999852785084414e-06,
"loss": 0.9006,
"step": 2000
},
{
"epoch": 0.5138036809815951,
"grad_norm": 11.649446487426758,
"learning_rate": 9.999776351733751e-06,
"loss": 0.831,
"step": 2010
},
{
"epoch": 0.516359918200409,
"grad_norm": 18.077003479003906,
"learning_rate": 9.999683995288008e-06,
"loss": 0.8372,
"step": 2020
},
{
"epoch": 0.5189161554192229,
"grad_norm": 24.69324493408203,
"learning_rate": 9.999575716041316e-06,
"loss": 1.0961,
"step": 2030
},
{
"epoch": 0.5214723926380368,
"grad_norm": 10.308004379272461,
"learning_rate": 9.99945151433852e-06,
"loss": 1.0896,
"step": 2040
},
{
"epoch": 0.5240286298568507,
"grad_norm": 14.579326629638672,
"learning_rate": 9.99931139057517e-06,
"loss": 0.8101,
"step": 2050
},
{
"epoch": 0.5265848670756647,
"grad_norm": 19.19144630432129,
"learning_rate": 9.999155345197531e-06,
"loss": 0.9718,
"step": 2060
},
{
"epoch": 0.5291411042944786,
"grad_norm": 14.424161911010742,
"learning_rate": 9.99898337870257e-06,
"loss": 1.1082,
"step": 2070
},
{
"epoch": 0.5316973415132924,
"grad_norm": 11.568525314331055,
"learning_rate": 9.998795491637956e-06,
"loss": 0.9928,
"step": 2080
},
{
"epoch": 0.5342535787321063,
"grad_norm": 28.195453643798828,
"learning_rate": 9.998591684602065e-06,
"loss": 0.967,
"step": 2090
},
{
"epoch": 0.5368098159509203,
"grad_norm": 11.809616088867188,
"learning_rate": 9.998371958243977e-06,
"loss": 0.8879,
"step": 2100
},
{
"epoch": 0.5393660531697342,
"grad_norm": 11.77135944366455,
"learning_rate": 9.998136313263465e-06,
"loss": 1.0883,
"step": 2110
},
{
"epoch": 0.5419222903885481,
"grad_norm": 17.555498123168945,
"learning_rate": 9.997884750411004e-06,
"loss": 1.0922,
"step": 2120
},
{
"epoch": 0.5444785276073619,
"grad_norm": 11.646632194519043,
"learning_rate": 9.997617270487761e-06,
"loss": 0.831,
"step": 2130
},
{
"epoch": 0.5470347648261759,
"grad_norm": 11.330808639526367,
"learning_rate": 9.997333874345594e-06,
"loss": 1.1629,
"step": 2140
},
{
"epoch": 0.5495910020449898,
"grad_norm": 12.656023979187012,
"learning_rate": 9.997034562887054e-06,
"loss": 1.1112,
"step": 2150
},
{
"epoch": 0.5521472392638037,
"grad_norm": 10.297701835632324,
"learning_rate": 9.996719337065376e-06,
"loss": 0.9942,
"step": 2160
},
{
"epoch": 0.5547034764826176,
"grad_norm": 20.408578872680664,
"learning_rate": 9.99638819788448e-06,
"loss": 0.7756,
"step": 2170
},
{
"epoch": 0.5572597137014315,
"grad_norm": 13.656134605407715,
"learning_rate": 9.996041146398963e-06,
"loss": 1.2323,
"step": 2180
},
{
"epoch": 0.5598159509202454,
"grad_norm": 10.573500633239746,
"learning_rate": 9.995678183714104e-06,
"loss": 0.9494,
"step": 2190
},
{
"epoch": 0.5623721881390593,
"grad_norm": 30.932117462158203,
"learning_rate": 9.99529931098585e-06,
"loss": 0.9215,
"step": 2200
},
{
"epoch": 0.5649284253578732,
"grad_norm": 12.926258087158203,
"learning_rate": 9.994904529420824e-06,
"loss": 1.151,
"step": 2210
},
{
"epoch": 0.5674846625766872,
"grad_norm": 9.75345516204834,
"learning_rate": 9.994493840276308e-06,
"loss": 1.0613,
"step": 2220
},
{
"epoch": 0.570040899795501,
"grad_norm": 15.309710502624512,
"learning_rate": 9.99406724486025e-06,
"loss": 1.1024,
"step": 2230
},
{
"epoch": 0.5725971370143149,
"grad_norm": 13.060432434082031,
"learning_rate": 9.993624744531253e-06,
"loss": 0.8317,
"step": 2240
},
{
"epoch": 0.5751533742331288,
"grad_norm": 22.823984146118164,
"learning_rate": 9.993166340698577e-06,
"loss": 0.9703,
"step": 2250
},
{
"epoch": 0.5777096114519428,
"grad_norm": 11.097712516784668,
"learning_rate": 9.992692034822127e-06,
"loss": 0.9237,
"step": 2260
},
{
"epoch": 0.5802658486707567,
"grad_norm": 14.171446800231934,
"learning_rate": 9.992201828412458e-06,
"loss": 0.9436,
"step": 2270
},
{
"epoch": 0.5828220858895705,
"grad_norm": 10.901077270507812,
"learning_rate": 9.991695723030755e-06,
"loss": 0.9086,
"step": 2280
},
{
"epoch": 0.5853783231083844,
"grad_norm": 44.82511901855469,
"learning_rate": 9.991173720288847e-06,
"loss": 0.7686,
"step": 2290
},
{
"epoch": 0.5879345603271984,
"grad_norm": 8.220059394836426,
"learning_rate": 9.990635821849187e-06,
"loss": 0.7624,
"step": 2300
},
{
"epoch": 0.5904907975460123,
"grad_norm": 11.58703327178955,
"learning_rate": 9.990082029424852e-06,
"loss": 0.7953,
"step": 2310
},
{
"epoch": 0.5930470347648262,
"grad_norm": 18.552797317504883,
"learning_rate": 9.989512344779541e-06,
"loss": 0.7791,
"step": 2320
},
{
"epoch": 0.59560327198364,
"grad_norm": 16.435989379882812,
"learning_rate": 9.988926769727563e-06,
"loss": 1.1133,
"step": 2330
},
{
"epoch": 0.598159509202454,
"grad_norm": 9.04973316192627,
"learning_rate": 9.988325306133832e-06,
"loss": 0.8,
"step": 2340
},
{
"epoch": 0.6007157464212679,
"grad_norm": 9.818502426147461,
"learning_rate": 9.987707955913873e-06,
"loss": 0.7636,
"step": 2350
},
{
"epoch": 0.6032719836400818,
"grad_norm": 8.12960147857666,
"learning_rate": 9.98707472103379e-06,
"loss": 0.7332,
"step": 2360
},
{
"epoch": 0.6058282208588958,
"grad_norm": 14.352721214294434,
"learning_rate": 9.986425603510292e-06,
"loss": 0.7819,
"step": 2370
},
{
"epoch": 0.6083844580777096,
"grad_norm": 6.8704986572265625,
"learning_rate": 9.985760605410662e-06,
"loss": 0.7691,
"step": 2380
},
{
"epoch": 0.6109406952965235,
"grad_norm": 10.685389518737793,
"learning_rate": 9.985079728852759e-06,
"loss": 0.8252,
"step": 2390
},
{
"epoch": 0.6134969325153374,
"grad_norm": 16.207923889160156,
"learning_rate": 9.98438297600501e-06,
"loss": 0.9821,
"step": 2400
},
{
"epoch": 0.6160531697341514,
"grad_norm": 15.584657669067383,
"learning_rate": 9.983670349086413e-06,
"loss": 0.876,
"step": 2410
},
{
"epoch": 0.6186094069529653,
"grad_norm": 15.134186744689941,
"learning_rate": 9.982941850366513e-06,
"loss": 0.5934,
"step": 2420
},
{
"epoch": 0.6211656441717791,
"grad_norm": 28.123193740844727,
"learning_rate": 9.982197482165398e-06,
"loss": 0.7742,
"step": 2430
},
{
"epoch": 0.623721881390593,
"grad_norm": 17.409650802612305,
"learning_rate": 9.981437246853712e-06,
"loss": 0.7065,
"step": 2440
},
{
"epoch": 0.626278118609407,
"grad_norm": 13.156755447387695,
"learning_rate": 9.980661146852619e-06,
"loss": 0.6499,
"step": 2450
},
{
"epoch": 0.6288343558282209,
"grad_norm": 20.250652313232422,
"learning_rate": 9.979869184633812e-06,
"loss": 0.7821,
"step": 2460
},
{
"epoch": 0.6313905930470347,
"grad_norm": 52.275699615478516,
"learning_rate": 9.979061362719502e-06,
"loss": 0.8,
"step": 2470
},
{
"epoch": 0.6339468302658486,
"grad_norm": 10.591206550598145,
"learning_rate": 9.97823768368241e-06,
"loss": 1.0135,
"step": 2480
},
{
"epoch": 0.6365030674846626,
"grad_norm": 20.04345703125,
"learning_rate": 9.977398150145758e-06,
"loss": 0.9202,
"step": 2490
},
{
"epoch": 0.6390593047034765,
"grad_norm": 15.350805282592773,
"learning_rate": 9.976542764783256e-06,
"loss": 1.0958,
"step": 2500
},
{
"epoch": 0.6416155419222904,
"grad_norm": 10.294832229614258,
"learning_rate": 9.97567153031911e-06,
"loss": 0.9347,
"step": 2510
},
{
"epoch": 0.6441717791411042,
"grad_norm": 18.00196075439453,
"learning_rate": 9.974784449527984e-06,
"loss": 0.776,
"step": 2520
},
{
"epoch": 0.6467280163599182,
"grad_norm": 15.802022933959961,
"learning_rate": 9.973881525235028e-06,
"loss": 0.7016,
"step": 2530
},
{
"epoch": 0.6492842535787321,
"grad_norm": 16.474000930786133,
"learning_rate": 9.972962760315834e-06,
"loss": 0.9632,
"step": 2540
},
{
"epoch": 0.651840490797546,
"grad_norm": 20.025535583496094,
"learning_rate": 9.972028157696452e-06,
"loss": 0.9582,
"step": 2550
},
{
"epoch": 0.65439672801636,
"grad_norm": 20.044818878173828,
"learning_rate": 9.971077720353368e-06,
"loss": 0.8913,
"step": 2560
},
{
"epoch": 0.6569529652351738,
"grad_norm": 10.750015258789062,
"learning_rate": 9.970111451313498e-06,
"loss": 0.9251,
"step": 2570
},
{
"epoch": 0.6595092024539877,
"grad_norm": 13.033714294433594,
"learning_rate": 9.969129353654179e-06,
"loss": 0.8761,
"step": 2580
},
{
"epoch": 0.6620654396728016,
"grad_norm": 9.243477821350098,
"learning_rate": 9.968131430503157e-06,
"loss": 0.5353,
"step": 2590
},
{
"epoch": 0.6646216768916156,
"grad_norm": 8.169621467590332,
"learning_rate": 9.96711768503858e-06,
"loss": 0.6617,
"step": 2600
},
{
"epoch": 0.6671779141104295,
"grad_norm": 21.10552406311035,
"learning_rate": 9.966088120488985e-06,
"loss": 0.5695,
"step": 2610
},
{
"epoch": 0.6697341513292433,
"grad_norm": 9.105271339416504,
"learning_rate": 9.96504274013329e-06,
"loss": 0.9342,
"step": 2620
},
{
"epoch": 0.6722903885480572,
"grad_norm": 12.127760887145996,
"learning_rate": 9.96398154730078e-06,
"loss": 0.8841,
"step": 2630
},
{
"epoch": 0.6748466257668712,
"grad_norm": 6.325476169586182,
"learning_rate": 9.962904545371104e-06,
"loss": 0.6288,
"step": 2640
},
{
"epoch": 0.6774028629856851,
"grad_norm": 36.65105438232422,
"learning_rate": 9.961811737774256e-06,
"loss": 0.7858,
"step": 2650
},
{
"epoch": 0.679959100204499,
"grad_norm": 12.881020545959473,
"learning_rate": 9.960703127990564e-06,
"loss": 0.6614,
"step": 2660
},
{
"epoch": 0.6825153374233128,
"grad_norm": 9.100659370422363,
"learning_rate": 9.959578719550689e-06,
"loss": 0.753,
"step": 2670
},
{
"epoch": 0.6850715746421268,
"grad_norm": 6.299210071563721,
"learning_rate": 9.958438516035604e-06,
"loss": 0.7298,
"step": 2680
},
{
"epoch": 0.6876278118609407,
"grad_norm": 10.514267921447754,
"learning_rate": 9.957282521076583e-06,
"loss": 0.7337,
"step": 2690
},
{
"epoch": 0.6901840490797546,
"grad_norm": 6.144178867340088,
"learning_rate": 9.956110738355197e-06,
"loss": 0.7576,
"step": 2700
},
{
"epoch": 0.6927402862985685,
"grad_norm": 7.862902641296387,
"learning_rate": 9.95492317160329e-06,
"loss": 0.8132,
"step": 2710
},
{
"epoch": 0.6952965235173824,
"grad_norm": 15.029640197753906,
"learning_rate": 9.953719824602982e-06,
"loss": 0.7462,
"step": 2720
},
{
"epoch": 0.6978527607361963,
"grad_norm": 13.379220008850098,
"learning_rate": 9.952500701186649e-06,
"loss": 0.4353,
"step": 2730
},
{
"epoch": 0.7004089979550102,
"grad_norm": 8.90844440460205,
"learning_rate": 9.951265805236903e-06,
"loss": 0.6655,
"step": 2740
},
{
"epoch": 0.7029652351738241,
"grad_norm": 14.42451000213623,
"learning_rate": 9.950015140686595e-06,
"loss": 0.6928,
"step": 2750
},
{
"epoch": 0.7055214723926381,
"grad_norm": 9.552287101745605,
"learning_rate": 9.948748711518792e-06,
"loss": 0.5294,
"step": 2760
},
{
"epoch": 0.7080777096114519,
"grad_norm": 12.426175117492676,
"learning_rate": 9.947466521766772e-06,
"loss": 0.7148,
"step": 2770
},
{
"epoch": 0.7106339468302658,
"grad_norm": 16.0783748626709,
"learning_rate": 9.946168575514e-06,
"loss": 0.6684,
"step": 2780
},
{
"epoch": 0.7131901840490797,
"grad_norm": 10.560613632202148,
"learning_rate": 9.94485487689413e-06,
"loss": 0.7561,
"step": 2790
},
{
"epoch": 0.7157464212678937,
"grad_norm": 13.276518821716309,
"learning_rate": 9.943525430090973e-06,
"loss": 0.5811,
"step": 2800
},
{
"epoch": 0.7183026584867076,
"grad_norm": 13.999181747436523,
"learning_rate": 9.942180239338503e-06,
"loss": 0.5591,
"step": 2810
},
{
"epoch": 0.7208588957055214,
"grad_norm": 12.428943634033203,
"learning_rate": 9.940819308920832e-06,
"loss": 0.7026,
"step": 2820
},
{
"epoch": 0.7234151329243353,
"grad_norm": 7.707891941070557,
"learning_rate": 9.939442643172197e-06,
"loss": 0.7179,
"step": 2830
},
{
"epoch": 0.7259713701431493,
"grad_norm": 7.399072170257568,
"learning_rate": 9.93805024647695e-06,
"loss": 0.664,
"step": 2840
},
{
"epoch": 0.7285276073619632,
"grad_norm": 23.526582717895508,
"learning_rate": 9.936642123269546e-06,
"loss": 0.7611,
"step": 2850
},
{
"epoch": 0.7310838445807771,
"grad_norm": 9.424376487731934,
"learning_rate": 9.93521827803452e-06,
"loss": 0.7113,
"step": 2860
},
{
"epoch": 0.733640081799591,
"grad_norm": 13.683032989501953,
"learning_rate": 9.933778715306474e-06,
"loss": 0.4565,
"step": 2870
},
{
"epoch": 0.7361963190184049,
"grad_norm": 8.428793907165527,
"learning_rate": 9.932323439670079e-06,
"loss": 0.6818,
"step": 2880
},
{
"epoch": 0.7387525562372188,
"grad_norm": 20.064414978027344,
"learning_rate": 9.930852455760039e-06,
"loss": 0.6954,
"step": 2890
},
{
"epoch": 0.7413087934560327,
"grad_norm": 12.071993827819824,
"learning_rate": 9.929365768261085e-06,
"loss": 0.8114,
"step": 2900
},
{
"epoch": 0.7438650306748467,
"grad_norm": 10.930386543273926,
"learning_rate": 9.927863381907963e-06,
"loss": 0.7282,
"step": 2910
},
{
"epoch": 0.7464212678936605,
"grad_norm": 10.170836448669434,
"learning_rate": 9.926345301485414e-06,
"loss": 0.9321,
"step": 2920
},
{
"epoch": 0.7489775051124744,
"grad_norm": 6.6626129150390625,
"learning_rate": 9.924811531828164e-06,
"loss": 0.7144,
"step": 2930
},
{
"epoch": 0.7515337423312883,
"grad_norm": 8.486347198486328,
"learning_rate": 9.923262077820903e-06,
"loss": 0.5393,
"step": 2940
},
{
"epoch": 0.7540899795501023,
"grad_norm": 12.877697944641113,
"learning_rate": 9.921696944398274e-06,
"loss": 0.4268,
"step": 2950
},
{
"epoch": 0.7566462167689162,
"grad_norm": 11.594487190246582,
"learning_rate": 9.920116136544849e-06,
"loss": 0.5911,
"step": 2960
},
{
"epoch": 0.75920245398773,
"grad_norm": 15.745911598205566,
"learning_rate": 9.918519659295127e-06,
"loss": 0.7711,
"step": 2970
},
{
"epoch": 0.7617586912065439,
"grad_norm": 13.972307205200195,
"learning_rate": 9.916907517733508e-06,
"loss": 0.5574,
"step": 2980
},
{
"epoch": 0.7643149284253579,
"grad_norm": 6.976569175720215,
"learning_rate": 9.915279716994276e-06,
"loss": 0.4998,
"step": 2990
},
{
"epoch": 0.7668711656441718,
"grad_norm": 6.9776787757873535,
"learning_rate": 9.913636262261592e-06,
"loss": 0.4069,
"step": 3000
},
{
"epoch": 0.7694274028629857,
"grad_norm": 12.110786437988281,
"learning_rate": 9.911977158769461e-06,
"loss": 0.6704,
"step": 3010
},
{
"epoch": 0.7719836400817995,
"grad_norm": 6.544830799102783,
"learning_rate": 9.910302411801738e-06,
"loss": 0.5889,
"step": 3020
},
{
"epoch": 0.7745398773006135,
"grad_norm": 8.968564987182617,
"learning_rate": 9.90861202669209e-06,
"loss": 0.4109,
"step": 3030
},
{
"epoch": 0.7770961145194274,
"grad_norm": 16.600383758544922,
"learning_rate": 9.906906008823989e-06,
"loss": 0.9562,
"step": 3040
},
{
"epoch": 0.7796523517382413,
"grad_norm": 21.926057815551758,
"learning_rate": 9.905184363630698e-06,
"loss": 0.5117,
"step": 3050
},
{
"epoch": 0.7822085889570553,
"grad_norm": 13.331565856933594,
"learning_rate": 9.903447096595245e-06,
"loss": 0.5186,
"step": 3060
},
{
"epoch": 0.7847648261758691,
"grad_norm": 10.782326698303223,
"learning_rate": 9.90169421325041e-06,
"loss": 0.5799,
"step": 3070
},
{
"epoch": 0.787321063394683,
"grad_norm": 20.489850997924805,
"learning_rate": 9.89992571917871e-06,
"loss": 0.4487,
"step": 3080
},
{
"epoch": 0.7898773006134969,
"grad_norm": 12.216683387756348,
"learning_rate": 9.898141620012374e-06,
"loss": 0.636,
"step": 3090
},
{
"epoch": 0.7924335378323109,
"grad_norm": 8.060449600219727,
"learning_rate": 9.896341921433337e-06,
"loss": 0.6251,
"step": 3100
},
{
"epoch": 0.7949897750511248,
"grad_norm": 5.005650997161865,
"learning_rate": 9.894526629173204e-06,
"loss": 0.6748,
"step": 3110
},
{
"epoch": 0.7975460122699386,
"grad_norm": 11.046931266784668,
"learning_rate": 9.892695749013253e-06,
"loss": 0.599,
"step": 3120
},
{
"epoch": 0.8001022494887525,
"grad_norm": 11.397811889648438,
"learning_rate": 9.890849286784398e-06,
"loss": 0.7874,
"step": 3130
},
{
"epoch": 0.8026584867075665,
"grad_norm": 8.473251342773438,
"learning_rate": 9.888987248367181e-06,
"loss": 0.6328,
"step": 3140
},
{
"epoch": 0.8052147239263804,
"grad_norm": 11.444445610046387,
"learning_rate": 9.88710963969175e-06,
"loss": 0.5749,
"step": 3150
},
{
"epoch": 0.8077709611451943,
"grad_norm": 8.93635082244873,
"learning_rate": 9.885216466737843e-06,
"loss": 0.7803,
"step": 3160
},
{
"epoch": 0.8103271983640081,
"grad_norm": 8.53089714050293,
"learning_rate": 9.883307735534761e-06,
"loss": 0.6362,
"step": 3170
},
{
"epoch": 0.8128834355828221,
"grad_norm": 4.943642616271973,
"learning_rate": 9.88138345216136e-06,
"loss": 0.6297,
"step": 3180
},
{
"epoch": 0.815439672801636,
"grad_norm": 10.993963241577148,
"learning_rate": 9.87944362274602e-06,
"loss": 0.4654,
"step": 3190
},
{
"epoch": 0.8179959100204499,
"grad_norm": 20.30816650390625,
"learning_rate": 9.87748825346664e-06,
"loss": 0.5197,
"step": 3200
},
{
"epoch": 0.8205521472392638,
"grad_norm": 10.663908004760742,
"learning_rate": 9.875517350550601e-06,
"loss": 0.6027,
"step": 3210
},
{
"epoch": 0.8231083844580777,
"grad_norm": 7.101048469543457,
"learning_rate": 9.873530920274761e-06,
"loss": 0.5027,
"step": 3220
},
{
"epoch": 0.8256646216768916,
"grad_norm": 16.21637725830078,
"learning_rate": 9.871528968965426e-06,
"loss": 0.6488,
"step": 3230
},
{
"epoch": 0.8282208588957055,
"grad_norm": 11.160218238830566,
"learning_rate": 9.86951150299833e-06,
"loss": 0.6848,
"step": 3240
},
{
"epoch": 0.8307770961145194,
"grad_norm": 7.589058876037598,
"learning_rate": 9.867478528798625e-06,
"loss": 0.3006,
"step": 3250
},
{
"epoch": 0.8333333333333334,
"grad_norm": 13.101618766784668,
"learning_rate": 9.865430052840849e-06,
"loss": 0.6459,
"step": 3260
},
{
"epoch": 0.8358895705521472,
"grad_norm": 6.775156021118164,
"learning_rate": 9.863366081648907e-06,
"loss": 0.5887,
"step": 3270
},
{
"epoch": 0.8384458077709611,
"grad_norm": 14.762919425964355,
"learning_rate": 9.861286621796056e-06,
"loss": 0.4892,
"step": 3280
},
{
"epoch": 0.841002044989775,
"grad_norm": 22.660533905029297,
"learning_rate": 9.85919167990488e-06,
"loss": 0.5762,
"step": 3290
},
{
"epoch": 0.843558282208589,
"grad_norm": 12.753227233886719,
"learning_rate": 9.857081262647269e-06,
"loss": 0.6596,
"step": 3300
},
{
"epoch": 0.8461145194274029,
"grad_norm": 14.134135246276855,
"learning_rate": 9.854955376744397e-06,
"loss": 0.5865,
"step": 3310
},
{
"epoch": 0.8486707566462167,
"grad_norm": 7.306004047393799,
"learning_rate": 9.852814028966706e-06,
"loss": 0.5196,
"step": 3320
},
{
"epoch": 0.8512269938650306,
"grad_norm": 12.3103609085083,
"learning_rate": 9.850657226133878e-06,
"loss": 0.605,
"step": 3330
},
{
"epoch": 0.8537832310838446,
"grad_norm": 7.823228359222412,
"learning_rate": 9.848484975114812e-06,
"loss": 0.6368,
"step": 3340
},
{
"epoch": 0.8563394683026585,
"grad_norm": 11.120277404785156,
"learning_rate": 9.846297282827612e-06,
"loss": 0.4841,
"step": 3350
},
{
"epoch": 0.8588957055214724,
"grad_norm": 8.988906860351562,
"learning_rate": 9.844094156239557e-06,
"loss": 0.5918,
"step": 3360
},
{
"epoch": 0.8614519427402862,
"grad_norm": 14.820247650146484,
"learning_rate": 9.841875602367079e-06,
"loss": 0.4307,
"step": 3370
},
{
"epoch": 0.8640081799591002,
"grad_norm": 7.334587097167969,
"learning_rate": 9.83964162827574e-06,
"loss": 0.564,
"step": 3380
},
{
"epoch": 0.8665644171779141,
"grad_norm": 11.864500999450684,
"learning_rate": 9.837392241080218e-06,
"loss": 0.5235,
"step": 3390
},
{
"epoch": 0.869120654396728,
"grad_norm": 10.920977592468262,
"learning_rate": 9.835127447944274e-06,
"loss": 0.4475,
"step": 3400
},
{
"epoch": 0.871676891615542,
"grad_norm": 8.427702903747559,
"learning_rate": 9.832847256080734e-06,
"loss": 0.5594,
"step": 3410
},
{
"epoch": 0.8742331288343558,
"grad_norm": 9.778414726257324,
"learning_rate": 9.830551672751463e-06,
"loss": 0.6194,
"step": 3420
},
{
"epoch": 0.8767893660531697,
"grad_norm": 8.027331352233887,
"learning_rate": 9.82824070526735e-06,
"loss": 0.5957,
"step": 3430
},
{
"epoch": 0.8793456032719836,
"grad_norm": 6.331071376800537,
"learning_rate": 9.825914360988271e-06,
"loss": 0.5145,
"step": 3440
},
{
"epoch": 0.8819018404907976,
"grad_norm": 8.607481956481934,
"learning_rate": 9.82357264732308e-06,
"loss": 0.5986,
"step": 3450
},
{
"epoch": 0.8844580777096115,
"grad_norm": 6.551468849182129,
"learning_rate": 9.821215571729578e-06,
"loss": 0.5461,
"step": 3460
},
{
"epoch": 0.8870143149284253,
"grad_norm": 6.835443496704102,
"learning_rate": 9.818843141714486e-06,
"loss": 0.7021,
"step": 3470
},
{
"epoch": 0.8895705521472392,
"grad_norm": 7.249754428863525,
"learning_rate": 9.81645536483343e-06,
"loss": 0.6188,
"step": 3480
},
{
"epoch": 0.8921267893660532,
"grad_norm": 7.487998962402344,
"learning_rate": 9.814052248690906e-06,
"loss": 0.4203,
"step": 3490
},
{
"epoch": 0.8946830265848671,
"grad_norm": 17.97199821472168,
"learning_rate": 9.81163380094027e-06,
"loss": 0.4725,
"step": 3500
},
{
"epoch": 0.897239263803681,
"grad_norm": 15.719616889953613,
"learning_rate": 9.809200029283698e-06,
"loss": 0.5723,
"step": 3510
},
{
"epoch": 0.8997955010224948,
"grad_norm": 9.500740051269531,
"learning_rate": 9.806750941472175e-06,
"loss": 0.417,
"step": 3520
},
{
"epoch": 0.9023517382413088,
"grad_norm": 7.425899505615234,
"learning_rate": 9.804286545305456e-06,
"loss": 0.4884,
"step": 3530
},
{
"epoch": 0.9049079754601227,
"grad_norm": 8.523987770080566,
"learning_rate": 9.801806848632062e-06,
"loss": 0.4925,
"step": 3540
},
{
"epoch": 0.9074642126789366,
"grad_norm": 13.769088745117188,
"learning_rate": 9.799311859349235e-06,
"loss": 0.3849,
"step": 3550
},
{
"epoch": 0.9100204498977505,
"grad_norm": 7.716251850128174,
"learning_rate": 9.796801585402913e-06,
"loss": 0.4594,
"step": 3560
},
{
"epoch": 0.9125766871165644,
"grad_norm": 10.922795295715332,
"learning_rate": 9.79427603478773e-06,
"loss": 0.4632,
"step": 3570
},
{
"epoch": 0.9151329243353783,
"grad_norm": 8.93303108215332,
"learning_rate": 9.791735215546953e-06,
"loss": 0.453,
"step": 3580
},
{
"epoch": 0.9176891615541922,
"grad_norm": 6.447891712188721,
"learning_rate": 9.78917913577249e-06,
"loss": 0.3284,
"step": 3590
},
{
"epoch": 0.9202453987730062,
"grad_norm": 8.590970993041992,
"learning_rate": 9.786607803604844e-06,
"loss": 0.4445,
"step": 3600
},
{
"epoch": 0.9228016359918201,
"grad_norm": 9.189178466796875,
"learning_rate": 9.784021227233097e-06,
"loss": 0.5768,
"step": 3610
},
{
"epoch": 0.9253578732106339,
"grad_norm": 8.67251968383789,
"learning_rate": 9.781419414894877e-06,
"loss": 0.4507,
"step": 3620
},
{
"epoch": 0.9279141104294478,
"grad_norm": 10.756339073181152,
"learning_rate": 9.778802374876332e-06,
"loss": 0.4278,
"step": 3630
},
{
"epoch": 0.9304703476482618,
"grad_norm": 9.680365562438965,
"learning_rate": 9.776170115512115e-06,
"loss": 0.3831,
"step": 3640
},
{
"epoch": 0.9330265848670757,
"grad_norm": 16.632375717163086,
"learning_rate": 9.773522645185342e-06,
"loss": 0.5033,
"step": 3650
},
{
"epoch": 0.9355828220858896,
"grad_norm": 7.6330695152282715,
"learning_rate": 9.770859972327575e-06,
"loss": 0.3978,
"step": 3660
},
{
"epoch": 0.9381390593047034,
"grad_norm": 8.260819435119629,
"learning_rate": 9.768182105418791e-06,
"loss": 0.5457,
"step": 3670
},
{
"epoch": 0.9406952965235174,
"grad_norm": 18.994287490844727,
"learning_rate": 9.765489052987357e-06,
"loss": 0.5469,
"step": 3680
},
{
"epoch": 0.9432515337423313,
"grad_norm": 8.636393547058105,
"learning_rate": 9.762780823610006e-06,
"loss": 0.4657,
"step": 3690
},
{
"epoch": 0.9458077709611452,
"grad_norm": 16.197158813476562,
"learning_rate": 9.760057425911797e-06,
"loss": 0.3715,
"step": 3700
},
{
"epoch": 0.9483640081799591,
"grad_norm": 28.646278381347656,
"learning_rate": 9.757318868566107e-06,
"loss": 0.3147,
"step": 3710
},
{
"epoch": 0.950920245398773,
"grad_norm": 9.230977058410645,
"learning_rate": 9.754565160294587e-06,
"loss": 0.6337,
"step": 3720
},
{
"epoch": 0.9534764826175869,
"grad_norm": 17.38115882873535,
"learning_rate": 9.751796309867139e-06,
"loss": 0.4393,
"step": 3730
},
{
"epoch": 0.9560327198364008,
"grad_norm": 15.209970474243164,
"learning_rate": 9.749012326101891e-06,
"loss": 0.4759,
"step": 3740
},
{
"epoch": 0.9585889570552147,
"grad_norm": 15.37113094329834,
"learning_rate": 9.74621321786517e-06,
"loss": 0.493,
"step": 3750
},
{
"epoch": 0.9611451942740287,
"grad_norm": 9.076826095581055,
"learning_rate": 9.743398994071467e-06,
"loss": 0.2903,
"step": 3760
},
{
"epoch": 0.9637014314928425,
"grad_norm": 6.899563312530518,
"learning_rate": 9.740569663683413e-06,
"loss": 0.3847,
"step": 3770
},
{
"epoch": 0.9662576687116564,
"grad_norm": 14.622838973999023,
"learning_rate": 9.73772523571175e-06,
"loss": 0.3528,
"step": 3780
},
{
"epoch": 0.9688139059304703,
"grad_norm": 11.762303352355957,
"learning_rate": 9.734865719215303e-06,
"loss": 0.4437,
"step": 3790
},
{
"epoch": 0.9713701431492843,
"grad_norm": 11.108593940734863,
"learning_rate": 9.73199112330095e-06,
"loss": 0.2947,
"step": 3800
},
{
"epoch": 0.9739263803680982,
"grad_norm": 7.895074367523193,
"learning_rate": 9.729101457123593e-06,
"loss": 0.4659,
"step": 3810
},
{
"epoch": 0.976482617586912,
"grad_norm": 10.534423828125,
"learning_rate": 9.72619672988613e-06,
"loss": 0.5034,
"step": 3820
},
{
"epoch": 0.9790388548057259,
"grad_norm": 6.145469665527344,
"learning_rate": 9.723276950839425e-06,
"loss": 0.4708,
"step": 3830
},
{
"epoch": 0.9815950920245399,
"grad_norm": 5.333863258361816,
"learning_rate": 9.720342129282277e-06,
"loss": 0.5987,
"step": 3840
},
{
"epoch": 0.9841513292433538,
"grad_norm": 11.559300422668457,
"learning_rate": 9.717392274561392e-06,
"loss": 0.5316,
"step": 3850
},
{
"epoch": 0.9867075664621677,
"grad_norm": 7.202635288238525,
"learning_rate": 9.714427396071354e-06,
"loss": 0.3995,
"step": 3860
},
{
"epoch": 0.9892638036809815,
"grad_norm": 9.292013168334961,
"learning_rate": 9.711447503254595e-06,
"loss": 0.5362,
"step": 3870
},
{
"epoch": 0.9918200408997955,
"grad_norm": 15.875975608825684,
"learning_rate": 9.708452605601361e-06,
"loss": 0.3956,
"step": 3880
},
{
"epoch": 0.9943762781186094,
"grad_norm": 5.166224002838135,
"learning_rate": 9.705442712649688e-06,
"loss": 0.4298,
"step": 3890
},
{
"epoch": 0.9969325153374233,
"grad_norm": 28.647296905517578,
"learning_rate": 9.702417833985367e-06,
"loss": 0.5758,
"step": 3900
},
{
"epoch": 0.9994887525562373,
"grad_norm": 7.455996990203857,
"learning_rate": 9.699377979241915e-06,
"loss": 0.4445,
"step": 3910
},
{
"epoch": 1.0020449897750512,
"grad_norm": 8.313132286071777,
"learning_rate": 9.696323158100543e-06,
"loss": 0.3661,
"step": 3920
},
{
"epoch": 1.0046012269938651,
"grad_norm": 2.6401190757751465,
"learning_rate": 9.69325338029013e-06,
"loss": 0.4446,
"step": 3930
},
{
"epoch": 1.007157464212679,
"grad_norm": 8.16818904876709,
"learning_rate": 9.690168655587184e-06,
"loss": 0.298,
"step": 3940
},
{
"epoch": 1.0097137014314927,
"grad_norm": 9.28429889678955,
"learning_rate": 9.687068993815819e-06,
"loss": 0.2262,
"step": 3950
},
{
"epoch": 1.0122699386503067,
"grad_norm": 6.392743110656738,
"learning_rate": 9.683954404847715e-06,
"loss": 0.2432,
"step": 3960
},
{
"epoch": 1.0148261758691206,
"grad_norm": 6.890766620635986,
"learning_rate": 9.6808248986021e-06,
"loss": 0.4461,
"step": 3970
},
{
"epoch": 1.0173824130879345,
"grad_norm": 10.436578750610352,
"learning_rate": 9.6776804850457e-06,
"loss": 0.3529,
"step": 3980
},
{
"epoch": 1.0199386503067485,
"grad_norm": 7.264800071716309,
"learning_rate": 9.674521174192726e-06,
"loss": 0.2966,
"step": 3990
},
{
"epoch": 1.0224948875255624,
"grad_norm": 10.522168159484863,
"learning_rate": 9.671346976104828e-06,
"loss": 0.2953,
"step": 4000
},
{
"epoch": 1.0250511247443763,
"grad_norm": 5.65585994720459,
"learning_rate": 9.668157900891069e-06,
"loss": 0.3308,
"step": 4010
},
{
"epoch": 1.0276073619631902,
"grad_norm": 9.439372062683105,
"learning_rate": 9.664953958707892e-06,
"loss": 0.2545,
"step": 4020
},
{
"epoch": 1.0301635991820042,
"grad_norm": 3.5625405311584473,
"learning_rate": 9.661735159759093e-06,
"loss": 0.2846,
"step": 4030
},
{
"epoch": 1.032719836400818,
"grad_norm": 26.94212532043457,
"learning_rate": 9.658501514295775e-06,
"loss": 0.205,
"step": 4040
},
{
"epoch": 1.0352760736196318,
"grad_norm": 11.873112678527832,
"learning_rate": 9.655253032616327e-06,
"loss": 0.3401,
"step": 4050
},
{
"epoch": 1.0378323108384457,
"grad_norm": 7.584825038909912,
"learning_rate": 9.651989725066393e-06,
"loss": 0.2991,
"step": 4060
},
{
"epoch": 1.0403885480572597,
"grad_norm": 7.558630466461182,
"learning_rate": 9.648711602038823e-06,
"loss": 0.3096,
"step": 4070
},
{
"epoch": 1.0429447852760736,
"grad_norm": 24.522443771362305,
"learning_rate": 9.64541867397366e-06,
"loss": 0.4115,
"step": 4080
},
{
"epoch": 1.0455010224948875,
"grad_norm": 5.4436354637146,
"learning_rate": 9.642110951358097e-06,
"loss": 0.2687,
"step": 4090
},
{
"epoch": 1.0480572597137015,
"grad_norm": 9.708597183227539,
"learning_rate": 9.638788444726437e-06,
"loss": 0.2038,
"step": 4100
},
{
"epoch": 1.0506134969325154,
"grad_norm": 5.303321361541748,
"learning_rate": 9.635451164660073e-06,
"loss": 0.3039,
"step": 4110
},
{
"epoch": 1.0531697341513293,
"grad_norm": 7.557952404022217,
"learning_rate": 9.632099121787445e-06,
"loss": 0.3325,
"step": 4120
},
{
"epoch": 1.0557259713701432,
"grad_norm": 5.638031005859375,
"learning_rate": 9.628732326784014e-06,
"loss": 0.3189,
"step": 4130
},
{
"epoch": 1.058282208588957,
"grad_norm": 1.7007097005844116,
"learning_rate": 9.625350790372214e-06,
"loss": 0.3178,
"step": 4140
},
{
"epoch": 1.0608384458077709,
"grad_norm": 8.193168640136719,
"learning_rate": 9.621954523321434e-06,
"loss": 0.307,
"step": 4150
},
{
"epoch": 1.0633946830265848,
"grad_norm": 15.883909225463867,
"learning_rate": 9.618543536447974e-06,
"loss": 0.2642,
"step": 4160
},
{
"epoch": 1.0659509202453987,
"grad_norm": 13.922346115112305,
"learning_rate": 9.615117840615011e-06,
"loss": 0.3466,
"step": 4170
},
{
"epoch": 1.0685071574642127,
"grad_norm": 21.666532516479492,
"learning_rate": 9.611677446732576e-06,
"loss": 0.2475,
"step": 4180
},
{
"epoch": 1.0710633946830266,
"grad_norm": 14.09211540222168,
"learning_rate": 9.608222365757498e-06,
"loss": 0.2698,
"step": 4190
},
{
"epoch": 1.0736196319018405,
"grad_norm": 9.652295112609863,
"learning_rate": 9.604752608693384e-06,
"loss": 0.2477,
"step": 4200
},
{
"epoch": 1.0761758691206544,
"grad_norm": 5.439416408538818,
"learning_rate": 9.601268186590587e-06,
"loss": 0.2024,
"step": 4210
},
{
"epoch": 1.0787321063394684,
"grad_norm": 3.458691358566284,
"learning_rate": 9.597769110546158e-06,
"loss": 0.2974,
"step": 4220
},
{
"epoch": 1.0812883435582823,
"grad_norm": 8.662911415100098,
"learning_rate": 9.594255391703821e-06,
"loss": 0.2053,
"step": 4230
},
{
"epoch": 1.0838445807770962,
"grad_norm": 9.305736541748047,
"learning_rate": 9.59072704125393e-06,
"loss": 0.3785,
"step": 4240
},
{
"epoch": 1.08640081799591,
"grad_norm": 8.057384490966797,
"learning_rate": 9.587184070433442e-06,
"loss": 0.239,
"step": 4250
},
{
"epoch": 1.0889570552147239,
"grad_norm": 11.628586769104004,
"learning_rate": 9.583626490525872e-06,
"loss": 0.3451,
"step": 4260
},
{
"epoch": 1.0915132924335378,
"grad_norm": 5.124874591827393,
"learning_rate": 9.580054312861264e-06,
"loss": 0.3267,
"step": 4270
},
{
"epoch": 1.0940695296523517,
"grad_norm": 8.520767211914062,
"learning_rate": 9.576467548816154e-06,
"loss": 0.2843,
"step": 4280
},
{
"epoch": 1.0966257668711656,
"grad_norm": 13.09350872039795,
"learning_rate": 9.572866209813525e-06,
"loss": 0.2522,
"step": 4290
},
{
"epoch": 1.0991820040899796,
"grad_norm": 6.647915840148926,
"learning_rate": 9.569250307322788e-06,
"loss": 0.3104,
"step": 4300
},
{
"epoch": 1.1017382413087935,
"grad_norm": 10.310320854187012,
"learning_rate": 9.565619852859727e-06,
"loss": 0.2137,
"step": 4310
},
{
"epoch": 1.1042944785276074,
"grad_norm": 6.362160682678223,
"learning_rate": 9.561974857986472e-06,
"loss": 0.1824,
"step": 4320
},
{
"epoch": 1.1068507157464214,
"grad_norm": 16.666887283325195,
"learning_rate": 9.558315334311467e-06,
"loss": 0.3631,
"step": 4330
},
{
"epoch": 1.109406952965235,
"grad_norm": 2.7935502529144287,
"learning_rate": 9.554641293489419e-06,
"loss": 0.2915,
"step": 4340
},
{
"epoch": 1.111963190184049,
"grad_norm": 15.494998931884766,
"learning_rate": 9.55095274722127e-06,
"loss": 0.2922,
"step": 4350
},
{
"epoch": 1.114519427402863,
"grad_norm": 6.94740629196167,
"learning_rate": 9.547249707254166e-06,
"loss": 0.264,
"step": 4360
},
{
"epoch": 1.1170756646216768,
"grad_norm": 7.18923807144165,
"learning_rate": 9.543532185381397e-06,
"loss": 0.3097,
"step": 4370
},
{
"epoch": 1.1196319018404908,
"grad_norm": 10.083481788635254,
"learning_rate": 9.53980019344239e-06,
"loss": 0.2706,
"step": 4380
},
{
"epoch": 1.1221881390593047,
"grad_norm": 7.783493995666504,
"learning_rate": 9.53605374332265e-06,
"loss": 0.1824,
"step": 4390
},
{
"epoch": 1.1247443762781186,
"grad_norm": 10.747809410095215,
"learning_rate": 9.532292846953723e-06,
"loss": 0.3375,
"step": 4400
},
{
"epoch": 1.1273006134969326,
"grad_norm": 11.694700241088867,
"learning_rate": 9.528517516313167e-06,
"loss": 0.2018,
"step": 4410
},
{
"epoch": 1.1298568507157465,
"grad_norm": 6.256073474884033,
"learning_rate": 9.524727763424513e-06,
"loss": 0.1545,
"step": 4420
},
{
"epoch": 1.1324130879345604,
"grad_norm": 6.233736991882324,
"learning_rate": 9.520923600357217e-06,
"loss": 0.2827,
"step": 4430
},
{
"epoch": 1.1349693251533743,
"grad_norm": 8.213584899902344,
"learning_rate": 9.517105039226632e-06,
"loss": 0.315,
"step": 4440
},
{
"epoch": 1.137525562372188,
"grad_norm": 12.951038360595703,
"learning_rate": 9.513272092193965e-06,
"loss": 0.2061,
"step": 4450
},
{
"epoch": 1.140081799591002,
"grad_norm": 5.706482410430908,
"learning_rate": 9.509424771466236e-06,
"loss": 0.2526,
"step": 4460
},
{
"epoch": 1.142638036809816,
"grad_norm": 6.124299049377441,
"learning_rate": 9.505563089296246e-06,
"loss": 0.3302,
"step": 4470
},
{
"epoch": 1.1451942740286298,
"grad_norm": 11.08293342590332,
"learning_rate": 9.501687057982531e-06,
"loss": 0.2411,
"step": 4480
},
{
"epoch": 1.1477505112474438,
"grad_norm": 8.393287658691406,
"learning_rate": 9.497796689869324e-06,
"loss": 0.3682,
"step": 4490
},
{
"epoch": 1.1503067484662577,
"grad_norm": 0.49787667393684387,
"learning_rate": 9.493891997346522e-06,
"loss": 0.176,
"step": 4500
},
{
"epoch": 1.1528629856850716,
"grad_norm": 6.434317588806152,
"learning_rate": 9.489972992849641e-06,
"loss": 0.2696,
"step": 4510
},
{
"epoch": 1.1554192229038855,
"grad_norm": 8.729398727416992,
"learning_rate": 9.486039688859772e-06,
"loss": 0.2838,
"step": 4520
},
{
"epoch": 1.1579754601226995,
"grad_norm": 9.446803092956543,
"learning_rate": 9.482092097903551e-06,
"loss": 0.3253,
"step": 4530
},
{
"epoch": 1.1605316973415132,
"grad_norm": 6.4901957511901855,
"learning_rate": 9.478130232553111e-06,
"loss": 0.3429,
"step": 4540
},
{
"epoch": 1.163087934560327,
"grad_norm": 9.026398658752441,
"learning_rate": 9.474154105426055e-06,
"loss": 0.3302,
"step": 4550
},
{
"epoch": 1.165644171779141,
"grad_norm": 6.108066082000732,
"learning_rate": 9.470163729185392e-06,
"loss": 0.1702,
"step": 4560
},
{
"epoch": 1.168200408997955,
"grad_norm": 10.425956726074219,
"learning_rate": 9.466159116539523e-06,
"loss": 0.3008,
"step": 4570
},
{
"epoch": 1.170756646216769,
"grad_norm": 4.817817211151123,
"learning_rate": 9.462140280242182e-06,
"loss": 0.3151,
"step": 4580
},
{
"epoch": 1.1733128834355828,
"grad_norm": 17.882158279418945,
"learning_rate": 9.458107233092406e-06,
"loss": 0.23,
"step": 4590
},
{
"epoch": 1.1758691206543967,
"grad_norm": 5.028483867645264,
"learning_rate": 9.454059987934487e-06,
"loss": 0.2413,
"step": 4600
},
{
"epoch": 1.1784253578732107,
"grad_norm": 9.872651100158691,
"learning_rate": 9.449998557657936e-06,
"loss": 0.1329,
"step": 4610
},
{
"epoch": 1.1809815950920246,
"grad_norm": 5.998063087463379,
"learning_rate": 9.445922955197437e-06,
"loss": 0.2879,
"step": 4620
},
{
"epoch": 1.1835378323108385,
"grad_norm": 8.390649795532227,
"learning_rate": 9.441833193532817e-06,
"loss": 0.2824,
"step": 4630
},
{
"epoch": 1.1860940695296525,
"grad_norm": 6.652390003204346,
"learning_rate": 9.437729285688986e-06,
"loss": 0.3389,
"step": 4640
},
{
"epoch": 1.1886503067484662,
"grad_norm": 10.573369026184082,
"learning_rate": 9.433611244735914e-06,
"loss": 0.3841,
"step": 4650
},
{
"epoch": 1.19120654396728,
"grad_norm": 10.0396146774292,
"learning_rate": 9.429479083788578e-06,
"loss": 0.2638,
"step": 4660
},
{
"epoch": 1.193762781186094,
"grad_norm": 11.902812004089355,
"learning_rate": 9.425332816006927e-06,
"loss": 0.4186,
"step": 4670
},
{
"epoch": 1.196319018404908,
"grad_norm": 10.162897109985352,
"learning_rate": 9.421172454595834e-06,
"loss": 0.3057,
"step": 4680
},
{
"epoch": 1.1988752556237219,
"grad_norm": 11.278912544250488,
"learning_rate": 9.416998012805057e-06,
"loss": 0.3223,
"step": 4690
},
{
"epoch": 1.2014314928425358,
"grad_norm": 8.295330047607422,
"learning_rate": 9.412809503929198e-06,
"loss": 0.2588,
"step": 4700
},
{
"epoch": 1.2039877300613497,
"grad_norm": 7.55431604385376,
"learning_rate": 9.408606941307658e-06,
"loss": 0.3087,
"step": 4710
},
{
"epoch": 1.2065439672801637,
"grad_norm": 3.9323720932006836,
"learning_rate": 9.404390338324599e-06,
"loss": 0.3091,
"step": 4720
},
{
"epoch": 1.2091002044989776,
"grad_norm": 7.560153007507324,
"learning_rate": 9.400159708408892e-06,
"loss": 0.2096,
"step": 4730
},
{
"epoch": 1.2116564417177913,
"grad_norm": 9.517462730407715,
"learning_rate": 9.395915065034085e-06,
"loss": 0.1582,
"step": 4740
},
{
"epoch": 1.2142126789366052,
"grad_norm": 5.7381720542907715,
"learning_rate": 9.391656421718356e-06,
"loss": 0.1742,
"step": 4750
},
{
"epoch": 1.2167689161554192,
"grad_norm": 7.014863014221191,
"learning_rate": 9.387383792024469e-06,
"loss": 0.2988,
"step": 4760
},
{
"epoch": 1.219325153374233,
"grad_norm": 12.077631950378418,
"learning_rate": 9.383097189559728e-06,
"loss": 0.254,
"step": 4770
},
{
"epoch": 1.221881390593047,
"grad_norm": 8.781020164489746,
"learning_rate": 9.37879662797594e-06,
"loss": 0.2946,
"step": 4780
},
{
"epoch": 1.224437627811861,
"grad_norm": 9.89029312133789,
"learning_rate": 9.37448212096937e-06,
"loss": 0.2043,
"step": 4790
},
{
"epoch": 1.2269938650306749,
"grad_norm": 7.694300174713135,
"learning_rate": 9.370153682280692e-06,
"loss": 0.138,
"step": 4800
},
{
"epoch": 1.2295501022494888,
"grad_norm": 8.310929298400879,
"learning_rate": 9.365811325694949e-06,
"loss": 0.2311,
"step": 4810
},
{
"epoch": 1.2321063394683027,
"grad_norm": 12.575085639953613,
"learning_rate": 9.361455065041514e-06,
"loss": 0.2834,
"step": 4820
},
{
"epoch": 1.2346625766871167,
"grad_norm": 10.732074737548828,
"learning_rate": 9.357084914194036e-06,
"loss": 0.2134,
"step": 4830
},
{
"epoch": 1.2372188139059306,
"grad_norm": 10.34244441986084,
"learning_rate": 9.352700887070403e-06,
"loss": 0.3486,
"step": 4840
},
{
"epoch": 1.2397750511247443,
"grad_norm": 10.497349739074707,
"learning_rate": 9.348302997632699e-06,
"loss": 0.3058,
"step": 4850
},
{
"epoch": 1.2423312883435582,
"grad_norm": 14.589156150817871,
"learning_rate": 9.343891259887148e-06,
"loss": 0.2331,
"step": 4860
},
{
"epoch": 1.2448875255623721,
"grad_norm": 5.382908344268799,
"learning_rate": 9.339465687884086e-06,
"loss": 0.3091,
"step": 4870
},
{
"epoch": 1.247443762781186,
"grad_norm": 16.56047821044922,
"learning_rate": 9.335026295717902e-06,
"loss": 0.2812,
"step": 4880
},
{
"epoch": 1.25,
"grad_norm": 5.166291236877441,
"learning_rate": 9.330573097527002e-06,
"loss": 0.2357,
"step": 4890
},
{
"epoch": 1.252556237218814,
"grad_norm": 6.794707775115967,
"learning_rate": 9.326106107493762e-06,
"loss": 0.2503,
"step": 4900
},
{
"epoch": 1.2551124744376279,
"grad_norm": 6.429582118988037,
"learning_rate": 9.321625339844476e-06,
"loss": 0.1967,
"step": 4910
},
{
"epoch": 1.2576687116564418,
"grad_norm": 21.49854278564453,
"learning_rate": 9.317130808849322e-06,
"loss": 0.3339,
"step": 4920
},
{
"epoch": 1.2602249488752557,
"grad_norm": 6.054262161254883,
"learning_rate": 9.312622528822308e-06,
"loss": 0.1903,
"step": 4930
},
{
"epoch": 1.2627811860940694,
"grad_norm": 13.686524391174316,
"learning_rate": 9.308100514121233e-06,
"loss": 0.1497,
"step": 4940
},
{
"epoch": 1.2653374233128836,
"grad_norm": 18.514162063598633,
"learning_rate": 9.303564779147634e-06,
"loss": 0.2372,
"step": 4950
},
{
"epoch": 1.2678936605316973,
"grad_norm": 6.550439357757568,
"learning_rate": 9.299015338346745e-06,
"loss": 0.2101,
"step": 4960
},
{
"epoch": 1.2704498977505112,
"grad_norm": 9.836435317993164,
"learning_rate": 9.294452206207448e-06,
"loss": 0.1643,
"step": 4970
},
{
"epoch": 1.2730061349693251,
"grad_norm": 7.0567307472229,
"learning_rate": 9.289875397262234e-06,
"loss": 0.1969,
"step": 4980
},
{
"epoch": 1.275562372188139,
"grad_norm": 8.437677383422852,
"learning_rate": 9.285284926087144e-06,
"loss": 0.3502,
"step": 4990
},
{
"epoch": 1.278118609406953,
"grad_norm": 7.982880592346191,
"learning_rate": 9.280680807301735e-06,
"loss": 0.1473,
"step": 5000
},
{
"epoch": 1.280674846625767,
"grad_norm": 6.814586162567139,
"learning_rate": 9.276063055569029e-06,
"loss": 0.2684,
"step": 5010
},
{
"epoch": 1.2832310838445808,
"grad_norm": 5.944293022155762,
"learning_rate": 9.271431685595461e-06,
"loss": 0.1763,
"step": 5020
},
{
"epoch": 1.2857873210633946,
"grad_norm": 5.889406204223633,
"learning_rate": 9.266786712130842e-06,
"loss": 0.1852,
"step": 5030
},
{
"epoch": 1.2883435582822087,
"grad_norm": 5.56532096862793,
"learning_rate": 9.262128149968304e-06,
"loss": 0.3474,
"step": 5040
},
{
"epoch": 1.2908997955010224,
"grad_norm": 6.4994049072265625,
"learning_rate": 9.257456013944255e-06,
"loss": 0.1804,
"step": 5050
},
{
"epoch": 1.2934560327198363,
"grad_norm": 6.235182285308838,
"learning_rate": 9.252770318938334e-06,
"loss": 0.2414,
"step": 5060
},
{
"epoch": 1.2960122699386503,
"grad_norm": 5.915652275085449,
"learning_rate": 9.248071079873362e-06,
"loss": 0.2333,
"step": 5070
},
{
"epoch": 1.2985685071574642,
"grad_norm": 9.032744407653809,
"learning_rate": 9.243358311715298e-06,
"loss": 0.2185,
"step": 5080
},
{
"epoch": 1.3011247443762781,
"grad_norm": 7.362344264984131,
"learning_rate": 9.238632029473178e-06,
"loss": 0.2571,
"step": 5090
},
{
"epoch": 1.303680981595092,
"grad_norm": 9.257672309875488,
"learning_rate": 9.23389224819909e-06,
"loss": 0.2363,
"step": 5100
},
{
"epoch": 1.306237218813906,
"grad_norm": 8.25611400604248,
"learning_rate": 9.229138982988102e-06,
"loss": 0.1432,
"step": 5110
},
{
"epoch": 1.30879345603272,
"grad_norm": 9.176118850708008,
"learning_rate": 9.224372248978231e-06,
"loss": 0.2158,
"step": 5120
},
{
"epoch": 1.3113496932515338,
"grad_norm": 3.796792984008789,
"learning_rate": 9.21959206135039e-06,
"loss": 0.1544,
"step": 5130
},
{
"epoch": 1.3139059304703475,
"grad_norm": 6.011196613311768,
"learning_rate": 9.214798435328334e-06,
"loss": 0.3326,
"step": 5140
},
{
"epoch": 1.3164621676891617,
"grad_norm": 16.793350219726562,
"learning_rate": 9.209991386178621e-06,
"loss": 0.2056,
"step": 5150
},
{
"epoch": 1.3190184049079754,
"grad_norm": 7.064115047454834,
"learning_rate": 9.205170929210552e-06,
"loss": 0.3113,
"step": 5160
},
{
"epoch": 1.3215746421267893,
"grad_norm": 19.5340518951416,
"learning_rate": 9.200337079776136e-06,
"loss": 0.1886,
"step": 5170
},
{
"epoch": 1.3241308793456033,
"grad_norm": 12.674887657165527,
"learning_rate": 9.195489853270029e-06,
"loss": 0.4599,
"step": 5180
},
{
"epoch": 1.3266871165644172,
"grad_norm": 13.094590187072754,
"learning_rate": 9.190629265129492e-06,
"loss": 0.2936,
"step": 5190
},
{
"epoch": 1.329243353783231,
"grad_norm": 9.762693405151367,
"learning_rate": 9.185755330834338e-06,
"loss": 0.2078,
"step": 5200
},
{
"epoch": 1.331799591002045,
"grad_norm": 7.909463405609131,
"learning_rate": 9.180868065906884e-06,
"loss": 0.2288,
"step": 5210
},
{
"epoch": 1.334355828220859,
"grad_norm": 7.411076545715332,
"learning_rate": 9.175967485911907e-06,
"loss": 0.2717,
"step": 5220
},
{
"epoch": 1.3369120654396727,
"grad_norm": 6.424882411956787,
"learning_rate": 9.171053606456582e-06,
"loss": 0.1745,
"step": 5230
},
{
"epoch": 1.3394683026584868,
"grad_norm": 6.506113052368164,
"learning_rate": 9.166126443190443e-06,
"loss": 0.1601,
"step": 5240
},
{
"epoch": 1.3420245398773005,
"grad_norm": 9.06916332244873,
"learning_rate": 9.161186011805332e-06,
"loss": 0.3146,
"step": 5250
},
{
"epoch": 1.3445807770961145,
"grad_norm": 10.523892402648926,
"learning_rate": 9.156232328035342e-06,
"loss": 0.2956,
"step": 5260
},
{
"epoch": 1.3471370143149284,
"grad_norm": 8.017621994018555,
"learning_rate": 9.151265407656775e-06,
"loss": 0.2294,
"step": 5270
},
{
"epoch": 1.3496932515337423,
"grad_norm": 14.679991722106934,
"learning_rate": 9.146285266488088e-06,
"loss": 0.2024,
"step": 5280
},
{
"epoch": 1.3522494887525562,
"grad_norm": 0.9324799180030823,
"learning_rate": 9.141291920389843e-06,
"loss": 0.1614,
"step": 5290
},
{
"epoch": 1.3548057259713702,
"grad_norm": 5.870517253875732,
"learning_rate": 9.136285385264655e-06,
"loss": 0.2225,
"step": 5300
},
{
"epoch": 1.357361963190184,
"grad_norm": 11.407279014587402,
"learning_rate": 9.131265677057146e-06,
"loss": 0.1872,
"step": 5310
},
{
"epoch": 1.359918200408998,
"grad_norm": 17.659618377685547,
"learning_rate": 9.12623281175389e-06,
"loss": 0.2171,
"step": 5320
},
{
"epoch": 1.362474437627812,
"grad_norm": 12.906618118286133,
"learning_rate": 9.121186805383358e-06,
"loss": 0.2759,
"step": 5330
},
{
"epoch": 1.3650306748466257,
"grad_norm": 6.954870223999023,
"learning_rate": 9.11612767401588e-06,
"loss": 0.2188,
"step": 5340
},
{
"epoch": 1.3675869120654398,
"grad_norm": 4.730753421783447,
"learning_rate": 9.111055433763582e-06,
"loss": 0.2126,
"step": 5350
},
{
"epoch": 1.3701431492842535,
"grad_norm": 13.265816688537598,
"learning_rate": 9.105970100780341e-06,
"loss": 0.2904,
"step": 5360
},
{
"epoch": 1.3726993865030674,
"grad_norm": 3.0092155933380127,
"learning_rate": 9.100871691261728e-06,
"loss": 0.1578,
"step": 5370
},
{
"epoch": 1.3752556237218814,
"grad_norm": 6.426031112670898,
"learning_rate": 9.09576022144496e-06,
"loss": 0.2037,
"step": 5380
},
{
"epoch": 1.3778118609406953,
"grad_norm": 8.25606918334961,
"learning_rate": 9.09063570760885e-06,
"loss": 0.1798,
"step": 5390
},
{
"epoch": 1.3803680981595092,
"grad_norm": 20.269100189208984,
"learning_rate": 9.085498166073755e-06,
"loss": 0.3306,
"step": 5400
},
{
"epoch": 1.3829243353783232,
"grad_norm": 7.950530529022217,
"learning_rate": 9.080347613201513e-06,
"loss": 0.2489,
"step": 5410
},
{
"epoch": 1.385480572597137,
"grad_norm": 11.141780853271484,
"learning_rate": 9.075184065395413e-06,
"loss": 0.2043,
"step": 5420
},
{
"epoch": 1.3880368098159508,
"grad_norm": 4.896001815795898,
"learning_rate": 9.070007539100118e-06,
"loss": 0.3356,
"step": 5430
},
{
"epoch": 1.390593047034765,
"grad_norm": 11.557963371276855,
"learning_rate": 9.064818050801634e-06,
"loss": 0.1741,
"step": 5440
},
{
"epoch": 1.3931492842535786,
"grad_norm": 6.800997734069824,
"learning_rate": 9.05961561702724e-06,
"loss": 0.1887,
"step": 5450
},
{
"epoch": 1.3957055214723926,
"grad_norm": 6.017879009246826,
"learning_rate": 9.054400254345448e-06,
"loss": 0.2398,
"step": 5460
},
{
"epoch": 1.3982617586912065,
"grad_norm": 6.6386189460754395,
"learning_rate": 9.049171979365945e-06,
"loss": 0.1465,
"step": 5470
},
{
"epoch": 1.4008179959100204,
"grad_norm": 4.621875762939453,
"learning_rate": 9.043930808739537e-06,
"loss": 0.335,
"step": 5480
},
{
"epoch": 1.4033742331288344,
"grad_norm": 6.274672508239746,
"learning_rate": 9.038676759158105e-06,
"loss": 0.1384,
"step": 5490
},
{
"epoch": 1.4059304703476483,
"grad_norm": 2.794377565383911,
"learning_rate": 9.033409847354542e-06,
"loss": 0.2304,
"step": 5500
},
{
"epoch": 1.4084867075664622,
"grad_norm": 10.634669303894043,
"learning_rate": 9.028130090102706e-06,
"loss": 0.3528,
"step": 5510
},
{
"epoch": 1.4110429447852761,
"grad_norm": 6.818256855010986,
"learning_rate": 9.022837504217366e-06,
"loss": 0.1227,
"step": 5520
},
{
"epoch": 1.41359918200409,
"grad_norm": 8.108813285827637,
"learning_rate": 9.017532106554143e-06,
"loss": 0.2864,
"step": 5530
},
{
"epoch": 1.4161554192229038,
"grad_norm": 8.222419738769531,
"learning_rate": 9.012213914009464e-06,
"loss": 0.251,
"step": 5540
},
{
"epoch": 1.418711656441718,
"grad_norm": 9.900671005249023,
"learning_rate": 9.006882943520506e-06,
"loss": 0.2974,
"step": 5550
},
{
"epoch": 1.4212678936605316,
"grad_norm": 4.816144943237305,
"learning_rate": 9.001539212065136e-06,
"loss": 0.2626,
"step": 5560
},
{
"epoch": 1.4238241308793456,
"grad_norm": 3.0924923419952393,
"learning_rate": 8.996182736661863e-06,
"loss": 0.1263,
"step": 5570
},
{
"epoch": 1.4263803680981595,
"grad_norm": 5.688522815704346,
"learning_rate": 8.990813534369787e-06,
"loss": 0.2336,
"step": 5580
},
{
"epoch": 1.4289366053169734,
"grad_norm": 10.940909385681152,
"learning_rate": 8.985431622288533e-06,
"loss": 0.2868,
"step": 5590
},
{
"epoch": 1.4314928425357873,
"grad_norm": 13.232209205627441,
"learning_rate": 8.98003701755821e-06,
"loss": 0.2469,
"step": 5600
},
{
"epoch": 1.4340490797546013,
"grad_norm": 7.461823463439941,
"learning_rate": 8.974629737359348e-06,
"loss": 0.2405,
"step": 5610
},
{
"epoch": 1.4366053169734152,
"grad_norm": 3.547605037689209,
"learning_rate": 8.96920979891284e-06,
"loss": 0.1996,
"step": 5620
},
{
"epoch": 1.439161554192229,
"grad_norm": 6.454622745513916,
"learning_rate": 8.963777219479902e-06,
"loss": 0.2072,
"step": 5630
},
{
"epoch": 1.441717791411043,
"grad_norm": 6.902385711669922,
"learning_rate": 8.958332016362e-06,
"loss": 0.0997,
"step": 5640
},
{
"epoch": 1.4442740286298568,
"grad_norm": 7.078310489654541,
"learning_rate": 8.952874206900809e-06,
"loss": 0.1943,
"step": 5650
},
{
"epoch": 1.4468302658486707,
"grad_norm": 5.974771976470947,
"learning_rate": 8.94740380847815e-06,
"loss": 0.3048,
"step": 5660
},
{
"epoch": 1.4493865030674846,
"grad_norm": 7.63726806640625,
"learning_rate": 8.941920838515936e-06,
"loss": 0.1593,
"step": 5670
},
{
"epoch": 1.4519427402862985,
"grad_norm": 5.1760430335998535,
"learning_rate": 8.936425314476121e-06,
"loss": 0.1877,
"step": 5680
},
{
"epoch": 1.4544989775051125,
"grad_norm": 8.131750106811523,
"learning_rate": 8.930917253860637e-06,
"loss": 0.2409,
"step": 5690
},
{
"epoch": 1.4570552147239264,
"grad_norm": 6.002188205718994,
"learning_rate": 8.925396674211341e-06,
"loss": 0.2159,
"step": 5700
},
{
"epoch": 1.4596114519427403,
"grad_norm": 12.237569808959961,
"learning_rate": 8.919863593109967e-06,
"loss": 0.2005,
"step": 5710
},
{
"epoch": 1.4621676891615543,
"grad_norm": 14.401376724243164,
"learning_rate": 8.914318028178055e-06,
"loss": 0.3153,
"step": 5720
},
{
"epoch": 1.4647239263803682,
"grad_norm": 5.81574821472168,
"learning_rate": 8.908759997076909e-06,
"loss": 0.1836,
"step": 5730
},
{
"epoch": 1.467280163599182,
"grad_norm": 6.657829761505127,
"learning_rate": 8.903189517507527e-06,
"loss": 0.2741,
"step": 5740
},
{
"epoch": 1.469836400817996,
"grad_norm": 4.597752094268799,
"learning_rate": 8.897606607210563e-06,
"loss": 0.1928,
"step": 5750
},
{
"epoch": 1.4723926380368098,
"grad_norm": 7.948934555053711,
"learning_rate": 8.892011283966253e-06,
"loss": 0.1889,
"step": 5760
},
{
"epoch": 1.4749488752556237,
"grad_norm": 14.392995834350586,
"learning_rate": 8.886403565594367e-06,
"loss": 0.2368,
"step": 5770
},
{
"epoch": 1.4775051124744376,
"grad_norm": 7.179086685180664,
"learning_rate": 8.88078346995415e-06,
"loss": 0.2,
"step": 5780
},
{
"epoch": 1.4800613496932515,
"grad_norm": 7.146066665649414,
"learning_rate": 8.875151014944267e-06,
"loss": 0.1678,
"step": 5790
},
{
"epoch": 1.4826175869120655,
"grad_norm": 9.944082260131836,
"learning_rate": 8.869506218502742e-06,
"loss": 0.1642,
"step": 5800
},
{
"epoch": 1.4851738241308794,
"grad_norm": 12.250117301940918,
"learning_rate": 8.863849098606907e-06,
"loss": 0.2266,
"step": 5810
},
{
"epoch": 1.4877300613496933,
"grad_norm": 29.39047622680664,
"learning_rate": 8.858179673273337e-06,
"loss": 0.2813,
"step": 5820
},
{
"epoch": 1.490286298568507,
"grad_norm": 11.951471328735352,
"learning_rate": 8.852497960557804e-06,
"loss": 0.3751,
"step": 5830
},
{
"epoch": 1.4928425357873212,
"grad_norm": 1.6928082704544067,
"learning_rate": 8.846803978555203e-06,
"loss": 0.1711,
"step": 5840
},
{
"epoch": 1.4953987730061349,
"grad_norm": 8.914717674255371,
"learning_rate": 8.84109774539951e-06,
"loss": 0.2084,
"step": 5850
},
{
"epoch": 1.4979550102249488,
"grad_norm": 9.57482624053955,
"learning_rate": 8.835379279263718e-06,
"loss": 0.2722,
"step": 5860
},
{
"epoch": 1.5005112474437627,
"grad_norm": 11.420355796813965,
"learning_rate": 8.829648598359775e-06,
"loss": 0.1593,
"step": 5870
},
{
"epoch": 1.5030674846625767,
"grad_norm": 4.315236568450928,
"learning_rate": 8.823905720938534e-06,
"loss": 0.1693,
"step": 5880
},
{
"epoch": 1.5056237218813906,
"grad_norm": 4.3361945152282715,
"learning_rate": 8.81815066528969e-06,
"loss": 0.164,
"step": 5890
},
{
"epoch": 1.5081799591002045,
"grad_norm": 9.296090126037598,
"learning_rate": 8.812383449741724e-06,
"loss": 0.1611,
"step": 5900
},
{
"epoch": 1.5107361963190185,
"grad_norm": 16.11349105834961,
"learning_rate": 8.806604092661839e-06,
"loss": 0.1636,
"step": 5910
},
{
"epoch": 1.5132924335378322,
"grad_norm": 12.905272483825684,
"learning_rate": 8.800812612455909e-06,
"loss": 0.1995,
"step": 5920
},
{
"epoch": 1.5158486707566463,
"grad_norm": 4.522705554962158,
"learning_rate": 8.79500902756842e-06,
"loss": 0.115,
"step": 5930
},
{
"epoch": 1.51840490797546,
"grad_norm": 8.156167984008789,
"learning_rate": 8.789193356482401e-06,
"loss": 0.1444,
"step": 5940
},
{
"epoch": 1.5209611451942742,
"grad_norm": 6.0793328285217285,
"learning_rate": 8.783365617719382e-06,
"loss": 0.1781,
"step": 5950
},
{
"epoch": 1.5235173824130879,
"grad_norm": 8.255613327026367,
"learning_rate": 8.777525829839317e-06,
"loss": 0.2307,
"step": 5960
},
{
"epoch": 1.5260736196319018,
"grad_norm": 13.122941017150879,
"learning_rate": 8.77167401144054e-06,
"loss": 0.1803,
"step": 5970
},
{
"epoch": 1.5286298568507157,
"grad_norm": 4.706987380981445,
"learning_rate": 8.765810181159696e-06,
"loss": 0.1343,
"step": 5980
},
{
"epoch": 1.5311860940695297,
"grad_norm": 4.327836990356445,
"learning_rate": 8.759934357671685e-06,
"loss": 0.2642,
"step": 5990
},
{
"epoch": 1.5337423312883436,
"grad_norm": 5.442415714263916,
"learning_rate": 8.754046559689602e-06,
"loss": 0.2007,
"step": 6000
},
{
"epoch": 1.5362985685071575,
"grad_norm": 12.884740829467773,
"learning_rate": 8.748146805964683e-06,
"loss": 0.2029,
"step": 6010
},
{
"epoch": 1.5388548057259714,
"grad_norm": 7.4214582443237305,
"learning_rate": 8.742235115286232e-06,
"loss": 0.131,
"step": 6020
},
{
"epoch": 1.5414110429447851,
"grad_norm": 5.057283878326416,
"learning_rate": 8.736311506481579e-06,
"loss": 0.2342,
"step": 6030
},
{
"epoch": 1.5439672801635993,
"grad_norm": 11.823676109313965,
"learning_rate": 8.730375998416e-06,
"loss": 0.145,
"step": 6040
},
{
"epoch": 1.546523517382413,
"grad_norm": 8.330456733703613,
"learning_rate": 8.724428609992675e-06,
"loss": 0.1139,
"step": 6050
},
{
"epoch": 1.5490797546012272,
"grad_norm": 11.217977523803711,
"learning_rate": 8.718469360152617e-06,
"loss": 0.2302,
"step": 6060
},
{
"epoch": 1.5516359918200409,
"grad_norm": 7.306154251098633,
"learning_rate": 8.712498267874615e-06,
"loss": 0.1695,
"step": 6070
},
{
"epoch": 1.5541922290388548,
"grad_norm": 5.975497722625732,
"learning_rate": 8.706515352175173e-06,
"loss": 0.2389,
"step": 6080
},
{
"epoch": 1.5567484662576687,
"grad_norm": 7.292505264282227,
"learning_rate": 8.700520632108453e-06,
"loss": 0.305,
"step": 6090
},
{
"epoch": 1.5593047034764826,
"grad_norm": 12.038248062133789,
"learning_rate": 8.694514126766205e-06,
"loss": 0.1872,
"step": 6100
},
{
"epoch": 1.5618609406952966,
"grad_norm": 5.702522277832031,
"learning_rate": 8.688495855277718e-06,
"loss": 0.1847,
"step": 6110
},
{
"epoch": 1.5644171779141103,
"grad_norm": 6.972240447998047,
"learning_rate": 8.68246583680975e-06,
"loss": 0.177,
"step": 6120
},
{
"epoch": 1.5669734151329244,
"grad_norm": 5.465381145477295,
"learning_rate": 8.676424090566473e-06,
"loss": 0.2276,
"step": 6130
},
{
"epoch": 1.5695296523517381,
"grad_norm": 3.666998863220215,
"learning_rate": 8.670370635789407e-06,
"loss": 0.2746,
"step": 6140
},
{
"epoch": 1.5720858895705523,
"grad_norm": 1.9799798727035522,
"learning_rate": 8.66430549175736e-06,
"loss": 0.1176,
"step": 6150
},
{
"epoch": 1.574642126789366,
"grad_norm": 5.453342437744141,
"learning_rate": 8.65822867778637e-06,
"loss": 0.2283,
"step": 6160
},
{
"epoch": 1.57719836400818,
"grad_norm": 5.7280683517456055,
"learning_rate": 8.652140213229642e-06,
"loss": 0.1838,
"step": 6170
},
{
"epoch": 1.5797546012269938,
"grad_norm": 5.071581840515137,
"learning_rate": 8.64604011747748e-06,
"loss": 0.179,
"step": 6180
},
{
"epoch": 1.5823108384458078,
"grad_norm": 1.5993189811706543,
"learning_rate": 8.639928409957236e-06,
"loss": 0.222,
"step": 6190
},
{
"epoch": 1.5848670756646217,
"grad_norm": 5.141691207885742,
"learning_rate": 8.63380511013324e-06,
"loss": 0.2307,
"step": 6200
},
{
"epoch": 1.5874233128834356,
"grad_norm": 8.022561073303223,
"learning_rate": 8.627670237506742e-06,
"loss": 0.2617,
"step": 6210
},
{
"epoch": 1.5899795501022496,
"grad_norm": 7.5429301261901855,
"learning_rate": 8.621523811615848e-06,
"loss": 0.1311,
"step": 6220
},
{
"epoch": 1.5925357873210633,
"grad_norm": 6.324619293212891,
"learning_rate": 8.615365852035456e-06,
"loss": 0.2665,
"step": 6230
},
{
"epoch": 1.5950920245398774,
"grad_norm": 5.001183032989502,
"learning_rate": 8.609196378377203e-06,
"loss": 0.205,
"step": 6240
},
{
"epoch": 1.5976482617586911,
"grad_norm": 7.617444038391113,
"learning_rate": 8.603015410289387e-06,
"loss": 0.4019,
"step": 6250
},
{
"epoch": 1.6002044989775053,
"grad_norm": 4.471902847290039,
"learning_rate": 8.596822967456915e-06,
"loss": 0.1962,
"step": 6260
},
{
"epoch": 1.602760736196319,
"grad_norm": 6.265940189361572,
"learning_rate": 8.590619069601247e-06,
"loss": 0.139,
"step": 6270
},
{
"epoch": 1.605316973415133,
"grad_norm": 6.503332614898682,
"learning_rate": 8.584403736480313e-06,
"loss": 0.1892,
"step": 6280
},
{
"epoch": 1.6078732106339468,
"grad_norm": 4.576842784881592,
"learning_rate": 8.57817698788847e-06,
"loss": 0.1271,
"step": 6290
},
{
"epoch": 1.6104294478527608,
"grad_norm": 7.434634685516357,
"learning_rate": 8.571938843656422e-06,
"loss": 0.2066,
"step": 6300
},
{
"epoch": 1.6129856850715747,
"grad_norm": 8.325051307678223,
"learning_rate": 8.565689323651174e-06,
"loss": 0.1975,
"step": 6310
},
{
"epoch": 1.6155419222903884,
"grad_norm": 7.133656978607178,
"learning_rate": 8.559428447775956e-06,
"loss": 0.1116,
"step": 6320
},
{
"epoch": 1.6180981595092025,
"grad_norm": 7.880911827087402,
"learning_rate": 8.553156235970163e-06,
"loss": 0.1743,
"step": 6330
},
{
"epoch": 1.6206543967280163,
"grad_norm": 20.269716262817383,
"learning_rate": 8.546872708209297e-06,
"loss": 0.1994,
"step": 6340
},
{
"epoch": 1.6232106339468304,
"grad_norm": 8.107951164245605,
"learning_rate": 8.54057788450489e-06,
"loss": 0.1642,
"step": 6350
},
{
"epoch": 1.6257668711656441,
"grad_norm": 5.440578937530518,
"learning_rate": 8.534271784904457e-06,
"loss": 0.1593,
"step": 6360
},
{
"epoch": 1.628323108384458,
"grad_norm": 3.178661584854126,
"learning_rate": 8.527954429491422e-06,
"loss": 0.2159,
"step": 6370
},
{
"epoch": 1.630879345603272,
"grad_norm": 5.0311055183410645,
"learning_rate": 8.521625838385052e-06,
"loss": 0.2587,
"step": 6380
},
{
"epoch": 1.633435582822086,
"grad_norm": 1.3832993507385254,
"learning_rate": 8.515286031740403e-06,
"loss": 0.1799,
"step": 6390
},
{
"epoch": 1.6359918200408998,
"grad_norm": 8.102804183959961,
"learning_rate": 8.508935029748244e-06,
"loss": 0.1516,
"step": 6400
},
{
"epoch": 1.6385480572597138,
"grad_norm": 6.02394437789917,
"learning_rate": 8.502572852635005e-06,
"loss": 0.179,
"step": 6410
},
{
"epoch": 1.6411042944785277,
"grad_norm": 6.3991312980651855,
"learning_rate": 8.4961995206627e-06,
"loss": 0.2349,
"step": 6420
},
{
"epoch": 1.6436605316973414,
"grad_norm": 5.750975608825684,
"learning_rate": 8.489815054128874e-06,
"loss": 0.1607,
"step": 6430
},
{
"epoch": 1.6462167689161555,
"grad_norm": 4.242618560791016,
"learning_rate": 8.483419473366525e-06,
"loss": 0.1986,
"step": 6440
},
{
"epoch": 1.6487730061349692,
"grad_norm": 9.25927734375,
"learning_rate": 8.477012798744056e-06,
"loss": 0.1515,
"step": 6450
},
{
"epoch": 1.6513292433537834,
"grad_norm": 0.4773276150226593,
"learning_rate": 8.470595050665196e-06,
"loss": 0.1506,
"step": 6460
},
{
"epoch": 1.653885480572597,
"grad_norm": 9.461527824401855,
"learning_rate": 8.464166249568944e-06,
"loss": 0.2223,
"step": 6470
},
{
"epoch": 1.656441717791411,
"grad_norm": 4.911471843719482,
"learning_rate": 8.457726415929494e-06,
"loss": 0.1179,
"step": 6480
},
{
"epoch": 1.658997955010225,
"grad_norm": 5.247636318206787,
"learning_rate": 8.451275570256183e-06,
"loss": 0.1667,
"step": 6490
},
{
"epoch": 1.6615541922290389,
"grad_norm": 7.205673694610596,
"learning_rate": 8.444813733093416e-06,
"loss": 0.184,
"step": 6500
},
{
"epoch": 1.6641104294478528,
"grad_norm": 12.158601760864258,
"learning_rate": 8.4383409250206e-06,
"loss": 0.1431,
"step": 6510
},
{
"epoch": 1.6666666666666665,
"grad_norm": 7.19647741317749,
"learning_rate": 8.43185716665209e-06,
"loss": 0.1936,
"step": 6520
},
{
"epoch": 1.6692229038854807,
"grad_norm": 7.732553958892822,
"learning_rate": 8.425362478637105e-06,
"loss": 0.1933,
"step": 6530
},
{
"epoch": 1.6717791411042944,
"grad_norm": 8.475358009338379,
"learning_rate": 8.418856881659677e-06,
"loss": 0.2284,
"step": 6540
},
{
"epoch": 1.6743353783231085,
"grad_norm": 11.112258911132812,
"learning_rate": 8.412340396438587e-06,
"loss": 0.1528,
"step": 6550
},
{
"epoch": 1.6768916155419222,
"grad_norm": 11.443809509277344,
"learning_rate": 8.405813043727279e-06,
"loss": 0.1782,
"step": 6560
},
{
"epoch": 1.6794478527607362,
"grad_norm": 0.7984766960144043,
"learning_rate": 8.399274844313816e-06,
"loss": 0.1205,
"step": 6570
},
{
"epoch": 1.68200408997955,
"grad_norm": 0.6593146324157715,
"learning_rate": 8.392725819020806e-06,
"loss": 0.0928,
"step": 6580
},
{
"epoch": 1.684560327198364,
"grad_norm": 7.761658668518066,
"learning_rate": 8.38616598870533e-06,
"loss": 0.1637,
"step": 6590
},
{
"epoch": 1.687116564417178,
"grad_norm": 6.802185535430908,
"learning_rate": 8.379595374258883e-06,
"loss": 0.3094,
"step": 6600
},
{
"epoch": 1.6896728016359919,
"grad_norm": 7.621953964233398,
"learning_rate": 8.373013996607309e-06,
"loss": 0.1235,
"step": 6610
},
{
"epoch": 1.6922290388548058,
"grad_norm": 5.766721248626709,
"learning_rate": 8.36642187671072e-06,
"loss": 0.1979,
"step": 6620
},
{
"epoch": 1.6947852760736195,
"grad_norm": 7.573540687561035,
"learning_rate": 8.359819035563447e-06,
"loss": 0.1544,
"step": 6630
},
{
"epoch": 1.6973415132924337,
"grad_norm": 7.856776237487793,
"learning_rate": 8.353205494193965e-06,
"loss": 0.2178,
"step": 6640
},
{
"epoch": 1.6998977505112474,
"grad_norm": 6.826193332672119,
"learning_rate": 8.346581273664826e-06,
"loss": 0.1453,
"step": 6650
},
{
"epoch": 1.7024539877300615,
"grad_norm": 3.6651082038879395,
"learning_rate": 8.339946395072593e-06,
"loss": 0.1316,
"step": 6660
},
{
"epoch": 1.7050102249488752,
"grad_norm": 13.016592025756836,
"learning_rate": 8.33330087954777e-06,
"loss": 0.2319,
"step": 6670
},
{
"epoch": 1.7075664621676891,
"grad_norm": 2.1794581413269043,
"learning_rate": 8.32664474825474e-06,
"loss": 0.096,
"step": 6680
},
{
"epoch": 1.710122699386503,
"grad_norm": 6.232535362243652,
"learning_rate": 8.319978022391692e-06,
"loss": 0.1157,
"step": 6690
},
{
"epoch": 1.712678936605317,
"grad_norm": 11.268756866455078,
"learning_rate": 8.313300723190561e-06,
"loss": 0.1155,
"step": 6700
},
{
"epoch": 1.715235173824131,
"grad_norm": 7.64271879196167,
"learning_rate": 8.306612871916946e-06,
"loss": 0.1295,
"step": 6710
},
{
"epoch": 1.7177914110429446,
"grad_norm": 3.3692967891693115,
"learning_rate": 8.299914489870065e-06,
"loss": 0.1837,
"step": 6720
},
{
"epoch": 1.7203476482617588,
"grad_norm": 3.621946096420288,
"learning_rate": 8.293205598382662e-06,
"loss": 0.116,
"step": 6730
},
{
"epoch": 1.7229038854805725,
"grad_norm": 7.414484024047852,
"learning_rate": 8.28648621882096e-06,
"loss": 0.2422,
"step": 6740
},
{
"epoch": 1.7254601226993866,
"grad_norm": 8.968006134033203,
"learning_rate": 8.279756372584575e-06,
"loss": 0.1423,
"step": 6750
},
{
"epoch": 1.7280163599182004,
"grad_norm": 5.072629451751709,
"learning_rate": 8.273016081106468e-06,
"loss": 0.1433,
"step": 6760
},
{
"epoch": 1.7305725971370143,
"grad_norm": 8.455986976623535,
"learning_rate": 8.266265365852854e-06,
"loss": 0.2221,
"step": 6770
},
{
"epoch": 1.7331288343558282,
"grad_norm": 7.337911128997803,
"learning_rate": 8.259504248323155e-06,
"loss": 0.0976,
"step": 6780
},
{
"epoch": 1.7356850715746421,
"grad_norm": 7.0469207763671875,
"learning_rate": 8.252732750049918e-06,
"loss": 0.1134,
"step": 6790
},
{
"epoch": 1.738241308793456,
"grad_norm": 6.939335823059082,
"learning_rate": 8.245950892598746e-06,
"loss": 0.1975,
"step": 6800
},
{
"epoch": 1.74079754601227,
"grad_norm": 3.959833860397339,
"learning_rate": 8.23915869756824e-06,
"loss": 0.1229,
"step": 6810
},
{
"epoch": 1.743353783231084,
"grad_norm": 9.389518737792969,
"learning_rate": 8.23235618658992e-06,
"loss": 0.1164,
"step": 6820
},
{
"epoch": 1.7459100204498976,
"grad_norm": 3.3109939098358154,
"learning_rate": 8.225543381328162e-06,
"loss": 0.1659,
"step": 6830
},
{
"epoch": 1.7484662576687118,
"grad_norm": 4.770479202270508,
"learning_rate": 8.218720303480124e-06,
"loss": 0.1385,
"step": 6840
},
{
"epoch": 1.7510224948875255,
"grad_norm": 3.3656115531921387,
"learning_rate": 8.211886974775682e-06,
"loss": 0.2088,
"step": 6850
},
{
"epoch": 1.7535787321063396,
"grad_norm": 5.787675857543945,
"learning_rate": 8.205043416977358e-06,
"loss": 0.0627,
"step": 6860
},
{
"epoch": 1.7561349693251533,
"grad_norm": 5.655759334564209,
"learning_rate": 8.198189651880253e-06,
"loss": 0.1626,
"step": 6870
},
{
"epoch": 1.7586912065439673,
"grad_norm": 5.212615966796875,
"learning_rate": 8.191325701311971e-06,
"loss": 0.1073,
"step": 6880
},
{
"epoch": 1.7612474437627812,
"grad_norm": 5.487759113311768,
"learning_rate": 8.18445158713256e-06,
"loss": 0.1968,
"step": 6890
},
{
"epoch": 1.7638036809815951,
"grad_norm": 13.81961727142334,
"learning_rate": 8.17756733123443e-06,
"loss": 0.1275,
"step": 6900
},
{
"epoch": 1.766359918200409,
"grad_norm": 5.11100959777832,
"learning_rate": 8.170672955542299e-06,
"loss": 0.183,
"step": 6910
},
{
"epoch": 1.7689161554192228,
"grad_norm": 1.606713056564331,
"learning_rate": 8.163768482013106e-06,
"loss": 0.0828,
"step": 6920
},
{
"epoch": 1.771472392638037,
"grad_norm": 5.141575813293457,
"learning_rate": 8.156853932635955e-06,
"loss": 0.1193,
"step": 6930
},
{
"epoch": 1.7740286298568506,
"grad_norm": 11.083499908447266,
"learning_rate": 8.149929329432032e-06,
"loss": 0.2004,
"step": 6940
},
{
"epoch": 1.7765848670756648,
"grad_norm": 10.328533172607422,
"learning_rate": 8.14299469445455e-06,
"loss": 0.0874,
"step": 6950
},
{
"epoch": 1.7791411042944785,
"grad_norm": 6.226305961608887,
"learning_rate": 8.136050049788666e-06,
"loss": 0.103,
"step": 6960
},
{
"epoch": 1.7816973415132924,
"grad_norm": 8.67745590209961,
"learning_rate": 8.129095417551416e-06,
"loss": 0.1642,
"step": 6970
},
{
"epoch": 1.7842535787321063,
"grad_norm": 9.080946922302246,
"learning_rate": 8.122130819891645e-06,
"loss": 0.14,
"step": 6980
},
{
"epoch": 1.7868098159509203,
"grad_norm": 4.160292625427246,
"learning_rate": 8.115156278989938e-06,
"loss": 0.0769,
"step": 6990
},
{
"epoch": 1.7893660531697342,
"grad_norm": 4.340435028076172,
"learning_rate": 8.10817181705854e-06,
"loss": 0.0904,
"step": 7000
},
{
"epoch": 1.7919222903885481,
"grad_norm": 5.093479156494141,
"learning_rate": 8.101177456341301e-06,
"loss": 0.1122,
"step": 7010
},
{
"epoch": 1.794478527607362,
"grad_norm": 7.038718223571777,
"learning_rate": 8.094173219113589e-06,
"loss": 0.1572,
"step": 7020
},
{
"epoch": 1.7970347648261757,
"grad_norm": 4.94278621673584,
"learning_rate": 8.087159127682227e-06,
"loss": 0.1477,
"step": 7030
},
{
"epoch": 1.79959100204499,
"grad_norm": 1.7163784503936768,
"learning_rate": 8.080135204385425e-06,
"loss": 0.2002,
"step": 7040
},
{
"epoch": 1.8021472392638036,
"grad_norm": 8.449196815490723,
"learning_rate": 8.073101471592702e-06,
"loss": 0.2222,
"step": 7050
},
{
"epoch": 1.8047034764826178,
"grad_norm": 6.09740686416626,
"learning_rate": 8.066057951704821e-06,
"loss": 0.14,
"step": 7060
},
{
"epoch": 1.8072597137014315,
"grad_norm": 13.180371284484863,
"learning_rate": 8.059004667153713e-06,
"loss": 0.0977,
"step": 7070
},
{
"epoch": 1.8098159509202454,
"grad_norm": 7.9253058433532715,
"learning_rate": 8.051941640402406e-06,
"loss": 0.1332,
"step": 7080
},
{
"epoch": 1.8123721881390593,
"grad_norm": 8.333995819091797,
"learning_rate": 8.044868893944955e-06,
"loss": 0.1297,
"step": 7090
},
{
"epoch": 1.8149284253578732,
"grad_norm": 8.638833045959473,
"learning_rate": 8.03778645030637e-06,
"loss": 0.101,
"step": 7100
},
{
"epoch": 1.8174846625766872,
"grad_norm": 6.839685916900635,
"learning_rate": 8.030694332042548e-06,
"loss": 0.0693,
"step": 7110
},
{
"epoch": 1.8200408997955009,
"grad_norm": 7.357212066650391,
"learning_rate": 8.02359256174019e-06,
"loss": 0.066,
"step": 7120
},
{
"epoch": 1.822597137014315,
"grad_norm": 4.24409294128418,
"learning_rate": 8.01648116201674e-06,
"loss": 0.1837,
"step": 7130
},
{
"epoch": 1.8251533742331287,
"grad_norm": 8.311896324157715,
"learning_rate": 8.009360155520313e-06,
"loss": 0.1389,
"step": 7140
},
{
"epoch": 1.8277096114519429,
"grad_norm": 12.251752853393555,
"learning_rate": 8.002229564929616e-06,
"loss": 0.111,
"step": 7150
},
{
"epoch": 1.8302658486707566,
"grad_norm": 5.574610233306885,
"learning_rate": 7.995089412953875e-06,
"loss": 0.1158,
"step": 7160
},
{
"epoch": 1.8328220858895705,
"grad_norm": 8.057143211364746,
"learning_rate": 7.987939722332776e-06,
"loss": 0.094,
"step": 7170
},
{
"epoch": 1.8353783231083844,
"grad_norm": 11.000237464904785,
"learning_rate": 7.980780515836377e-06,
"loss": 0.1,
"step": 7180
},
{
"epoch": 1.8379345603271984,
"grad_norm": 5.534488201141357,
"learning_rate": 7.97361181626504e-06,
"loss": 0.2236,
"step": 7190
},
{
"epoch": 1.8404907975460123,
"grad_norm": 6.447413444519043,
"learning_rate": 7.966433646449364e-06,
"loss": 0.2489,
"step": 7200
},
{
"epoch": 1.8430470347648262,
"grad_norm": 2.375591516494751,
"learning_rate": 7.959246029250112e-06,
"loss": 0.0896,
"step": 7210
},
{
"epoch": 1.8456032719836402,
"grad_norm": 7.849663734436035,
"learning_rate": 7.952048987558126e-06,
"loss": 0.2143,
"step": 7220
},
{
"epoch": 1.8481595092024539,
"grad_norm": 9.33170223236084,
"learning_rate": 7.944842544294268e-06,
"loss": 0.1366,
"step": 7230
},
{
"epoch": 1.850715746421268,
"grad_norm": 7.391844749450684,
"learning_rate": 7.937626722409342e-06,
"loss": 0.1979,
"step": 7240
},
{
"epoch": 1.8532719836400817,
"grad_norm": 0.42054474353790283,
"learning_rate": 7.930401544884017e-06,
"loss": 0.0991,
"step": 7250
},
{
"epoch": 1.8558282208588959,
"grad_norm": 6.135138511657715,
"learning_rate": 7.923167034728763e-06,
"loss": 0.0628,
"step": 7260
},
{
"epoch": 1.8583844580777096,
"grad_norm": 9.923365592956543,
"learning_rate": 7.915923214983767e-06,
"loss": 0.1159,
"step": 7270
},
{
"epoch": 1.8609406952965235,
"grad_norm": 7.890591144561768,
"learning_rate": 7.908670108718868e-06,
"loss": 0.1056,
"step": 7280
},
{
"epoch": 1.8634969325153374,
"grad_norm": 1.3051115274429321,
"learning_rate": 7.90140773903348e-06,
"loss": 0.1308,
"step": 7290
},
{
"epoch": 1.8660531697341514,
"grad_norm": 7.580386161804199,
"learning_rate": 7.894136129056516e-06,
"loss": 0.1585,
"step": 7300
},
{
"epoch": 1.8686094069529653,
"grad_norm": 4.543681621551514,
"learning_rate": 7.886855301946322e-06,
"loss": 0.0982,
"step": 7310
},
{
"epoch": 1.871165644171779,
"grad_norm": 8.670321464538574,
"learning_rate": 7.879565280890593e-06,
"loss": 0.1984,
"step": 7320
},
{
"epoch": 1.8737218813905931,
"grad_norm": 10.790763854980469,
"learning_rate": 7.872266089106309e-06,
"loss": 0.0939,
"step": 7330
},
{
"epoch": 1.8762781186094069,
"grad_norm": 5.62462854385376,
"learning_rate": 7.864957749839653e-06,
"loss": 0.125,
"step": 7340
},
{
"epoch": 1.878834355828221,
"grad_norm": 5.0767717361450195,
"learning_rate": 7.857640286365946e-06,
"loss": 0.1439,
"step": 7350
},
{
"epoch": 1.8813905930470347,
"grad_norm": 4.852501392364502,
"learning_rate": 7.850313721989558e-06,
"loss": 0.2335,
"step": 7360
},
{
"epoch": 1.8839468302658486,
"grad_norm": 8.364299774169922,
"learning_rate": 7.842978080043855e-06,
"loss": 0.138,
"step": 7370
},
{
"epoch": 1.8865030674846626,
"grad_norm": 16.219741821289062,
"learning_rate": 7.835633383891102e-06,
"loss": 0.1868,
"step": 7380
},
{
"epoch": 1.8890593047034765,
"grad_norm": 6.5828375816345215,
"learning_rate": 7.828279656922408e-06,
"loss": 0.1366,
"step": 7390
},
{
"epoch": 1.8916155419222904,
"grad_norm": 11.096482276916504,
"learning_rate": 7.820916922557636e-06,
"loss": 0.0636,
"step": 7400
},
{
"epoch": 1.8941717791411041,
"grad_norm": 7.485594749450684,
"learning_rate": 7.813545204245341e-06,
"loss": 0.2255,
"step": 7410
},
{
"epoch": 1.8967280163599183,
"grad_norm": 8.4011869430542,
"learning_rate": 7.806164525462687e-06,
"loss": 0.1484,
"step": 7420
},
{
"epoch": 1.899284253578732,
"grad_norm": 7.423107624053955,
"learning_rate": 7.798774909715374e-06,
"loss": 0.1592,
"step": 7430
},
{
"epoch": 1.9018404907975461,
"grad_norm": 5.52902364730835,
"learning_rate": 7.791376380537567e-06,
"loss": 0.0735,
"step": 7440
},
{
"epoch": 1.9043967280163598,
"grad_norm": 3.4649152755737305,
"learning_rate": 7.783968961491818e-06,
"loss": 0.1479,
"step": 7450
},
{
"epoch": 1.9069529652351738,
"grad_norm": 7.678995132446289,
"learning_rate": 7.776552676168987e-06,
"loss": 0.2274,
"step": 7460
},
{
"epoch": 1.9095092024539877,
"grad_norm": 0.06970912218093872,
"learning_rate": 7.769127548188174e-06,
"loss": 0.1003,
"step": 7470
},
{
"epoch": 1.9120654396728016,
"grad_norm": 17.272754669189453,
"learning_rate": 7.761693601196642e-06,
"loss": 0.0924,
"step": 7480
},
{
"epoch": 1.9146216768916156,
"grad_norm": 5.135831832885742,
"learning_rate": 7.75425085886974e-06,
"loss": 0.1176,
"step": 7490
},
{
"epoch": 1.9171779141104295,
"grad_norm": 5.651144981384277,
"learning_rate": 7.746799344910822e-06,
"loss": 0.1398,
"step": 7500
},
{
"epoch": 1.9197341513292434,
"grad_norm": 6.184920787811279,
"learning_rate": 7.739339083051186e-06,
"loss": 0.1766,
"step": 7510
},
{
"epoch": 1.9222903885480571,
"grad_norm": 6.632026672363281,
"learning_rate": 7.73187009704999e-06,
"loss": 0.1427,
"step": 7520
},
{
"epoch": 1.9248466257668713,
"grad_norm": 8.19317626953125,
"learning_rate": 7.724392410694167e-06,
"loss": 0.1126,
"step": 7530
},
{
"epoch": 1.927402862985685,
"grad_norm": 1.4213460683822632,
"learning_rate": 7.716906047798364e-06,
"loss": 0.1248,
"step": 7540
},
{
"epoch": 1.9299591002044991,
"grad_norm": 8.21669864654541,
"learning_rate": 7.709411032204868e-06,
"loss": 0.1148,
"step": 7550
},
{
"epoch": 1.9325153374233128,
"grad_norm": 6.994448661804199,
"learning_rate": 7.701907387783509e-06,
"loss": 0.1548,
"step": 7560
},
{
"epoch": 1.9350715746421268,
"grad_norm": 7.743505954742432,
"learning_rate": 7.694395138431608e-06,
"loss": 0.1274,
"step": 7570
},
{
"epoch": 1.9376278118609407,
"grad_norm": 5.287206172943115,
"learning_rate": 7.686874308073885e-06,
"loss": 0.0779,
"step": 7580
},
{
"epoch": 1.9401840490797546,
"grad_norm": 15.280442237854004,
"learning_rate": 7.679344920662394e-06,
"loss": 0.0718,
"step": 7590
},
{
"epoch": 1.9427402862985685,
"grad_norm": 9.01176929473877,
"learning_rate": 7.671807000176434e-06,
"loss": 0.2102,
"step": 7600
},
{
"epoch": 1.9452965235173822,
"grad_norm": 4.649341583251953,
"learning_rate": 7.664260570622487e-06,
"loss": 0.1391,
"step": 7610
},
{
"epoch": 1.9478527607361964,
"grad_norm": 9.841882705688477,
"learning_rate": 7.656705656034132e-06,
"loss": 0.1092,
"step": 7620
},
{
"epoch": 1.95040899795501,
"grad_norm": 4.586430072784424,
"learning_rate": 7.649142280471964e-06,
"loss": 0.1478,
"step": 7630
},
{
"epoch": 1.9529652351738243,
"grad_norm": 3.7405037879943848,
"learning_rate": 7.641570468023536e-06,
"loss": 0.157,
"step": 7640
},
{
"epoch": 1.955521472392638,
"grad_norm": 4.911379337310791,
"learning_rate": 7.633990242803263e-06,
"loss": 0.0739,
"step": 7650
},
{
"epoch": 1.9580777096114519,
"grad_norm": 14.748944282531738,
"learning_rate": 7.626401628952352e-06,
"loss": 0.1426,
"step": 7660
},
{
"epoch": 1.9606339468302658,
"grad_norm": 7.726930141448975,
"learning_rate": 7.61880465063873e-06,
"loss": 0.1107,
"step": 7670
},
{
"epoch": 1.9631901840490797,
"grad_norm": 7.120655059814453,
"learning_rate": 7.61119933205696e-06,
"loss": 0.044,
"step": 7680
},
{
"epoch": 1.9657464212678937,
"grad_norm": 1.3341773748397827,
"learning_rate": 7.603585697428169e-06,
"loss": 0.0847,
"step": 7690
},
{
"epoch": 1.9683026584867076,
"grad_norm": 5.155945777893066,
"learning_rate": 7.595963770999966e-06,
"loss": 0.2069,
"step": 7700
},
{
"epoch": 1.9708588957055215,
"grad_norm": 6.961178302764893,
"learning_rate": 7.588333577046368e-06,
"loss": 0.1673,
"step": 7710
},
{
"epoch": 1.9734151329243352,
"grad_norm": 5.766995906829834,
"learning_rate": 7.5806951398677255e-06,
"loss": 0.1469,
"step": 7720
},
{
"epoch": 1.9759713701431494,
"grad_norm": 11.985013961791992,
"learning_rate": 7.573048483790635e-06,
"loss": 0.1621,
"step": 7730
},
{
"epoch": 1.978527607361963,
"grad_norm": 0.5396488308906555,
"learning_rate": 7.565393633167876e-06,
"loss": 0.0574,
"step": 7740
},
{
"epoch": 1.9810838445807772,
"grad_norm": 11.865923881530762,
"learning_rate": 7.557730612378318e-06,
"loss": 0.1207,
"step": 7750
},
{
"epoch": 1.983640081799591,
"grad_norm": 4.756693363189697,
"learning_rate": 7.5500594458268576e-06,
"loss": 0.1147,
"step": 7760
},
{
"epoch": 1.9861963190184049,
"grad_norm": 4.860601425170898,
"learning_rate": 7.542380157944328e-06,
"loss": 0.0956,
"step": 7770
},
{
"epoch": 1.9887525562372188,
"grad_norm": 14.664186477661133,
"learning_rate": 7.534692773187431e-06,
"loss": 0.1399,
"step": 7780
},
{
"epoch": 1.9913087934560327,
"grad_norm": 4.663970470428467,
"learning_rate": 7.526997316038654e-06,
"loss": 0.0859,
"step": 7790
},
{
"epoch": 1.9938650306748467,
"grad_norm": 0.8506277203559875,
"learning_rate": 7.519293811006187e-06,
"loss": 0.136,
"step": 7800
},
{
"epoch": 1.9964212678936604,
"grad_norm": 5.4818644523620605,
"learning_rate": 7.511582282623865e-06,
"loss": 0.0835,
"step": 7810
},
{
"epoch": 1.9989775051124745,
"grad_norm": 5.375784397125244,
"learning_rate": 7.503862755451059e-06,
"loss": 0.1255,
"step": 7820
},
{
"epoch": 2.0015337423312882,
"grad_norm": 1.3432427644729614,
"learning_rate": 7.4961352540726274e-06,
"loss": 0.0644,
"step": 7830
},
{
"epoch": 2.0040899795501024,
"grad_norm": 8.615415573120117,
"learning_rate": 7.4883998030988136e-06,
"loss": 0.1136,
"step": 7840
},
{
"epoch": 2.006646216768916,
"grad_norm": 7.158458232879639,
"learning_rate": 7.480656427165187e-06,
"loss": 0.09,
"step": 7850
},
{
"epoch": 2.0092024539877302,
"grad_norm": 8.66907024383545,
"learning_rate": 7.47290515093255e-06,
"loss": 0.0703,
"step": 7860
},
{
"epoch": 2.011758691206544,
"grad_norm": 25.180543899536133,
"learning_rate": 7.465145999086874e-06,
"loss": 0.1314,
"step": 7870
},
{
"epoch": 2.014314928425358,
"grad_norm": 0.7909819483757019,
"learning_rate": 7.457378996339201e-06,
"loss": 0.0538,
"step": 7880
},
{
"epoch": 2.016871165644172,
"grad_norm": 0.6326285600662231,
"learning_rate": 7.4496041674255834e-06,
"loss": 0.0545,
"step": 7890
},
{
"epoch": 2.0194274028629855,
"grad_norm": 6.992855548858643,
"learning_rate": 7.441821537107e-06,
"loss": 0.0811,
"step": 7900
},
{
"epoch": 2.0219836400817996,
"grad_norm": 4.8581743240356445,
"learning_rate": 7.434031130169268e-06,
"loss": 0.0897,
"step": 7910
},
{
"epoch": 2.0245398773006134,
"grad_norm": 0.3861932158470154,
"learning_rate": 7.42623297142298e-06,
"loss": 0.0795,
"step": 7920
},
{
"epoch": 2.0270961145194275,
"grad_norm": 4.8933424949646,
"learning_rate": 7.418427085703406e-06,
"loss": 0.0746,
"step": 7930
},
{
"epoch": 2.029652351738241,
"grad_norm": 7.480552673339844,
"learning_rate": 7.410613497870432e-06,
"loss": 0.0816,
"step": 7940
},
{
"epoch": 2.0322085889570554,
"grad_norm": 5.1835126876831055,
"learning_rate": 7.402792232808474e-06,
"loss": 0.1248,
"step": 7950
},
{
"epoch": 2.034764826175869,
"grad_norm": 0.7514427304267883,
"learning_rate": 7.394963315426393e-06,
"loss": 0.077,
"step": 7960
},
{
"epoch": 2.037321063394683,
"grad_norm": 5.26667594909668,
"learning_rate": 7.387126770657423e-06,
"loss": 0.0694,
"step": 7970
},
{
"epoch": 2.039877300613497,
"grad_norm": 8.795965194702148,
"learning_rate": 7.379282623459093e-06,
"loss": 0.0845,
"step": 7980
},
{
"epoch": 2.0424335378323106,
"grad_norm": 5.604037284851074,
"learning_rate": 7.371430898813137e-06,
"loss": 0.0753,
"step": 7990
},
{
"epoch": 2.044989775051125,
"grad_norm": 2.7282750606536865,
"learning_rate": 7.363571621725427e-06,
"loss": 0.031,
"step": 8000
},
{
"epoch": 2.0475460122699385,
"grad_norm": 5.139689922332764,
"learning_rate": 7.355704817225886e-06,
"loss": 0.1,
"step": 8010
},
{
"epoch": 2.0501022494887526,
"grad_norm": 7.020951271057129,
"learning_rate": 7.347830510368409e-06,
"loss": 0.0798,
"step": 8020
},
{
"epoch": 2.0526584867075663,
"grad_norm": 2.1761505603790283,
"learning_rate": 7.3399487262307866e-06,
"loss": 0.0768,
"step": 8030
},
{
"epoch": 2.0552147239263805,
"grad_norm": 5.854605197906494,
"learning_rate": 7.332059489914619e-06,
"loss": 0.0601,
"step": 8040
},
{
"epoch": 2.057770961145194,
"grad_norm": 0.5980772376060486,
"learning_rate": 7.324162826545245e-06,
"loss": 0.0586,
"step": 8050
},
{
"epoch": 2.0603271983640083,
"grad_norm": 4.8323235511779785,
"learning_rate": 7.316258761271651e-06,
"loss": 0.0578,
"step": 8060
},
{
"epoch": 2.062883435582822,
"grad_norm": 8.097885131835938,
"learning_rate": 7.308347319266401e-06,
"loss": 0.0469,
"step": 8070
},
{
"epoch": 2.065439672801636,
"grad_norm": 5.477297782897949,
"learning_rate": 7.300428525725549e-06,
"loss": 0.0597,
"step": 8080
},
{
"epoch": 2.06799591002045,
"grad_norm": 2.20831298828125,
"learning_rate": 7.2925024058685664e-06,
"loss": 0.0512,
"step": 8090
},
{
"epoch": 2.0705521472392636,
"grad_norm": 6.855231761932373,
"learning_rate": 7.2845689849382514e-06,
"loss": 0.0787,
"step": 8100
},
{
"epoch": 2.0731083844580778,
"grad_norm": 9.492572784423828,
"learning_rate": 7.27662828820066e-06,
"loss": 0.104,
"step": 8110
},
{
"epoch": 2.0756646216768915,
"grad_norm": 7.048098087310791,
"learning_rate": 7.268680340945016e-06,
"loss": 0.1052,
"step": 8120
},
{
"epoch": 2.0782208588957056,
"grad_norm": 7.1551594734191895,
"learning_rate": 7.260725168483634e-06,
"loss": 0.0538,
"step": 8130
},
{
"epoch": 2.0807770961145193,
"grad_norm": 3.1020727157592773,
"learning_rate": 7.252762796151843e-06,
"loss": 0.0923,
"step": 8140
},
{
"epoch": 2.0833333333333335,
"grad_norm": 6.914649963378906,
"learning_rate": 7.2447932493079e-06,
"loss": 0.0458,
"step": 8150
},
{
"epoch": 2.085889570552147,
"grad_norm": 1.941754698753357,
"learning_rate": 7.236816553332909e-06,
"loss": 0.0847,
"step": 8160
},
{
"epoch": 2.0884458077709613,
"grad_norm": 3.0333592891693115,
"learning_rate": 7.228832733630742e-06,
"loss": 0.0318,
"step": 8170
},
{
"epoch": 2.091002044989775,
"grad_norm": 2.743631601333618,
"learning_rate": 7.220841815627966e-06,
"loss": 0.0935,
"step": 8180
},
{
"epoch": 2.0935582822085887,
"grad_norm": 6.149184226989746,
"learning_rate": 7.212843824773745e-06,
"loss": 0.1325,
"step": 8190
},
{
"epoch": 2.096114519427403,
"grad_norm": 9.376814842224121,
"learning_rate": 7.204838786539772e-06,
"loss": 0.0287,
"step": 8200
},
{
"epoch": 2.0986707566462166,
"grad_norm": 6.627695560455322,
"learning_rate": 7.196826726420185e-06,
"loss": 0.1187,
"step": 8210
},
{
"epoch": 2.1012269938650308,
"grad_norm": 7.894048690795898,
"learning_rate": 7.188807669931486e-06,
"loss": 0.078,
"step": 8220
},
{
"epoch": 2.1037832310838445,
"grad_norm": 6.502098083496094,
"learning_rate": 7.180781642612453e-06,
"loss": 0.0647,
"step": 8230
},
{
"epoch": 2.1063394683026586,
"grad_norm": 5.958528995513916,
"learning_rate": 7.172748670024073e-06,
"loss": 0.0945,
"step": 8240
},
{
"epoch": 2.1088957055214723,
"grad_norm": 0.8460894823074341,
"learning_rate": 7.164708777749445e-06,
"loss": 0.0558,
"step": 8250
},
{
"epoch": 2.1114519427402865,
"grad_norm": 5.054858207702637,
"learning_rate": 7.1566619913937105e-06,
"loss": 0.1047,
"step": 8260
},
{
"epoch": 2.1140081799591,
"grad_norm": 9.798078536987305,
"learning_rate": 7.148608336583961e-06,
"loss": 0.0616,
"step": 8270
},
{
"epoch": 2.116564417177914,
"grad_norm": 2.237877607345581,
"learning_rate": 7.140547838969168e-06,
"loss": 0.0827,
"step": 8280
},
{
"epoch": 2.119120654396728,
"grad_norm": 0.3861876428127289,
"learning_rate": 7.1324805242200956e-06,
"loss": 0.0635,
"step": 8290
},
{
"epoch": 2.1216768916155417,
"grad_norm": 6.713496685028076,
"learning_rate": 7.1244064180292134e-06,
"loss": 0.0663,
"step": 8300
},
{
"epoch": 2.124233128834356,
"grad_norm": 5.54539155960083,
"learning_rate": 7.116325546110628e-06,
"loss": 0.0446,
"step": 8310
},
{
"epoch": 2.1267893660531696,
"grad_norm": 5.177070617675781,
"learning_rate": 7.108237934199983e-06,
"loss": 0.0517,
"step": 8320
},
{
"epoch": 2.1293456032719837,
"grad_norm": 0.7041372060775757,
"learning_rate": 7.1001436080544e-06,
"loss": 0.0289,
"step": 8330
},
{
"epoch": 2.1319018404907975,
"grad_norm": 6.997579574584961,
"learning_rate": 7.0920425934523705e-06,
"loss": 0.0502,
"step": 8340
},
{
"epoch": 2.1344580777096116,
"grad_norm": 9.049081802368164,
"learning_rate": 7.083934916193698e-06,
"loss": 0.0795,
"step": 8350
},
{
"epoch": 2.1370143149284253,
"grad_norm": 3.9479804039001465,
"learning_rate": 7.075820602099399e-06,
"loss": 0.0659,
"step": 8360
},
{
"epoch": 2.1395705521472395,
"grad_norm": 7.389666557312012,
"learning_rate": 7.0676996770116294e-06,
"loss": 0.0533,
"step": 8370
},
{
"epoch": 2.142126789366053,
"grad_norm": 5.052390098571777,
"learning_rate": 7.059572166793598e-06,
"loss": 0.075,
"step": 8380
},
{
"epoch": 2.144683026584867,
"grad_norm": 8.923999786376953,
"learning_rate": 7.051438097329485e-06,
"loss": 0.0782,
"step": 8390
},
{
"epoch": 2.147239263803681,
"grad_norm": 9.955076217651367,
"learning_rate": 7.043297494524364e-06,
"loss": 0.0648,
"step": 8400
},
{
"epoch": 2.1497955010224947,
"grad_norm": 14.11737060546875,
"learning_rate": 7.03515038430411e-06,
"loss": 0.0368,
"step": 8410
},
{
"epoch": 2.152351738241309,
"grad_norm": 4.5228590965271,
"learning_rate": 7.026996792615328e-06,
"loss": 0.0758,
"step": 8420
},
{
"epoch": 2.1549079754601226,
"grad_norm": 6.418432712554932,
"learning_rate": 7.0188367454252624e-06,
"loss": 0.0705,
"step": 8430
},
{
"epoch": 2.1574642126789367,
"grad_norm": 5.669915676116943,
"learning_rate": 7.010670268721718e-06,
"loss": 0.1191,
"step": 8440
},
{
"epoch": 2.1600204498977504,
"grad_norm": 5.414175033569336,
"learning_rate": 7.002497388512971e-06,
"loss": 0.0665,
"step": 8450
},
{
"epoch": 2.1625766871165646,
"grad_norm": 3.9066929817199707,
"learning_rate": 6.9943181308277e-06,
"loss": 0.0625,
"step": 8460
},
{
"epoch": 2.1651329243353783,
"grad_norm": 0.23331096768379211,
"learning_rate": 6.986132521714888e-06,
"loss": 0.0674,
"step": 8470
},
{
"epoch": 2.1676891615541924,
"grad_norm": 3.160121440887451,
"learning_rate": 6.977940587243745e-06,
"loss": 0.0834,
"step": 8480
},
{
"epoch": 2.170245398773006,
"grad_norm": 4.351058483123779,
"learning_rate": 6.969742353503635e-06,
"loss": 0.0386,
"step": 8490
},
{
"epoch": 2.17280163599182,
"grad_norm": 9.820882797241211,
"learning_rate": 6.96153784660397e-06,
"loss": 0.0672,
"step": 8500
},
{
"epoch": 2.175357873210634,
"grad_norm": 15.702372550964355,
"learning_rate": 6.9533270926741506e-06,
"loss": 0.0749,
"step": 8510
},
{
"epoch": 2.1779141104294477,
"grad_norm": 5.2401509284973145,
"learning_rate": 6.945110117863469e-06,
"loss": 0.0703,
"step": 8520
},
{
"epoch": 2.180470347648262,
"grad_norm": 5.104111194610596,
"learning_rate": 6.936886948341029e-06,
"loss": 0.091,
"step": 8530
},
{
"epoch": 2.1830265848670756,
"grad_norm": 12.119421005249023,
"learning_rate": 6.928657610295666e-06,
"loss": 0.045,
"step": 8540
},
{
"epoch": 2.1855828220858897,
"grad_norm": 3.9862372875213623,
"learning_rate": 6.920422129935859e-06,
"loss": 0.0863,
"step": 8550
},
{
"epoch": 2.1881390593047034,
"grad_norm": 8.157721519470215,
"learning_rate": 6.912180533489645e-06,
"loss": 0.0649,
"step": 8560
},
{
"epoch": 2.1906952965235176,
"grad_norm": 12.727168083190918,
"learning_rate": 6.903932847204548e-06,
"loss": 0.0839,
"step": 8570
},
{
"epoch": 2.1932515337423313,
"grad_norm": 2.1506459712982178,
"learning_rate": 6.895679097347476e-06,
"loss": 0.0704,
"step": 8580
},
{
"epoch": 2.195807770961145,
"grad_norm": 3.3470993041992188,
"learning_rate": 6.887419310204657e-06,
"loss": 0.0637,
"step": 8590
},
{
"epoch": 2.198364008179959,
"grad_norm": 4.4683356285095215,
"learning_rate": 6.879153512081542e-06,
"loss": 0.0556,
"step": 8600
},
{
"epoch": 2.200920245398773,
"grad_norm": 5.225627899169922,
"learning_rate": 6.870881729302728e-06,
"loss": 0.0467,
"step": 8610
},
{
"epoch": 2.203476482617587,
"grad_norm": 4.654438018798828,
"learning_rate": 6.862603988211866e-06,
"loss": 0.117,
"step": 8620
},
{
"epoch": 2.2060327198364007,
"grad_norm": 5.765674114227295,
"learning_rate": 6.854320315171591e-06,
"loss": 0.0833,
"step": 8630
},
{
"epoch": 2.208588957055215,
"grad_norm": 4.424642086029053,
"learning_rate": 6.8460307365634225e-06,
"loss": 0.0879,
"step": 8640
},
{
"epoch": 2.2111451942740286,
"grad_norm": 0.613856315612793,
"learning_rate": 6.837735278787694e-06,
"loss": 0.0309,
"step": 8650
},
{
"epoch": 2.2137014314928427,
"grad_norm": 6.912576675415039,
"learning_rate": 6.829433968263458e-06,
"loss": 0.0571,
"step": 8660
},
{
"epoch": 2.2162576687116564,
"grad_norm": 4.189841270446777,
"learning_rate": 6.821126831428408e-06,
"loss": 0.0856,
"step": 8670
},
{
"epoch": 2.21881390593047,
"grad_norm": 2.174213171005249,
"learning_rate": 6.8128138947387966e-06,
"loss": 0.0573,
"step": 8680
},
{
"epoch": 2.2213701431492843,
"grad_norm": 9.983304023742676,
"learning_rate": 6.80449518466934e-06,
"loss": 0.0715,
"step": 8690
},
{
"epoch": 2.223926380368098,
"grad_norm": 5.989863872528076,
"learning_rate": 6.796170727713147e-06,
"loss": 0.0759,
"step": 8700
},
{
"epoch": 2.226482617586912,
"grad_norm": 2.114159345626831,
"learning_rate": 6.787840550381628e-06,
"loss": 0.0244,
"step": 8710
},
{
"epoch": 2.229038854805726,
"grad_norm": 7.211903095245361,
"learning_rate": 6.779504679204412e-06,
"loss": 0.0973,
"step": 8720
},
{
"epoch": 2.23159509202454,
"grad_norm": 5.518008232116699,
"learning_rate": 6.771163140729257e-06,
"loss": 0.1189,
"step": 8730
},
{
"epoch": 2.2341513292433537,
"grad_norm": 7.159096717834473,
"learning_rate": 6.762815961521976e-06,
"loss": 0.0472,
"step": 8740
},
{
"epoch": 2.236707566462168,
"grad_norm": 5.628960132598877,
"learning_rate": 6.754463168166342e-06,
"loss": 0.0646,
"step": 8750
},
{
"epoch": 2.2392638036809815,
"grad_norm": 2.7872536182403564,
"learning_rate": 6.746104787264011e-06,
"loss": 0.0603,
"step": 8760
},
{
"epoch": 2.2418200408997957,
"grad_norm": 4.476940155029297,
"learning_rate": 6.737740845434432e-06,
"loss": 0.0635,
"step": 8770
},
{
"epoch": 2.2443762781186094,
"grad_norm": 4.041048049926758,
"learning_rate": 6.7293713693147635e-06,
"loss": 0.0462,
"step": 8780
},
{
"epoch": 2.246932515337423,
"grad_norm": 3.1163430213928223,
"learning_rate": 6.720996385559793e-06,
"loss": 0.0552,
"step": 8790
},
{
"epoch": 2.2494887525562373,
"grad_norm": 5.569158554077148,
"learning_rate": 6.712615920841843e-06,
"loss": 0.0689,
"step": 8800
},
{
"epoch": 2.252044989775051,
"grad_norm": 5.894398212432861,
"learning_rate": 6.704230001850696e-06,
"loss": 0.0531,
"step": 8810
},
{
"epoch": 2.254601226993865,
"grad_norm": 6.340700149536133,
"learning_rate": 6.695838655293505e-06,
"loss": 0.0568,
"step": 8820
},
{
"epoch": 2.257157464212679,
"grad_norm": 4.014859199523926,
"learning_rate": 6.6874419078947076e-06,
"loss": 0.0613,
"step": 8830
},
{
"epoch": 2.259713701431493,
"grad_norm": 10.440202713012695,
"learning_rate": 6.679039786395936e-06,
"loss": 0.0497,
"step": 8840
},
{
"epoch": 2.2622699386503067,
"grad_norm": 9.94273567199707,
"learning_rate": 6.6706323175559504e-06,
"loss": 0.0866,
"step": 8850
},
{
"epoch": 2.264826175869121,
"grad_norm": 1.108022689819336,
"learning_rate": 6.662219528150529e-06,
"loss": 0.0504,
"step": 8860
},
{
"epoch": 2.2673824130879345,
"grad_norm": 3.8868322372436523,
"learning_rate": 6.653801444972398e-06,
"loss": 0.0675,
"step": 8870
},
{
"epoch": 2.2699386503067487,
"grad_norm": 3.9967801570892334,
"learning_rate": 6.64537809483115e-06,
"loss": 0.0846,
"step": 8880
},
{
"epoch": 2.2724948875255624,
"grad_norm": 4.602581024169922,
"learning_rate": 6.63694950455314e-06,
"loss": 0.0553,
"step": 8890
},
{
"epoch": 2.275051124744376,
"grad_norm": 0.8123490810394287,
"learning_rate": 6.628515700981424e-06,
"loss": 0.0463,
"step": 8900
},
{
"epoch": 2.2776073619631902,
"grad_norm": 0.3097759783267975,
"learning_rate": 6.620076710975648e-06,
"loss": 0.0754,
"step": 8910
},
{
"epoch": 2.280163599182004,
"grad_norm": 4.588552474975586,
"learning_rate": 6.611632561411987e-06,
"loss": 0.078,
"step": 8920
},
{
"epoch": 2.282719836400818,
"grad_norm": 7.720053195953369,
"learning_rate": 6.603183279183041e-06,
"loss": 0.0946,
"step": 8930
},
{
"epoch": 2.285276073619632,
"grad_norm": 6.0510945320129395,
"learning_rate": 6.594728891197758e-06,
"loss": 0.0565,
"step": 8940
},
{
"epoch": 2.287832310838446,
"grad_norm": 9.991423606872559,
"learning_rate": 6.586269424381349e-06,
"loss": 0.0585,
"step": 8950
},
{
"epoch": 2.2903885480572597,
"grad_norm": 0.4170069098472595,
"learning_rate": 6.577804905675196e-06,
"loss": 0.0552,
"step": 8960
},
{
"epoch": 2.292944785276074,
"grad_norm": 4.831167221069336,
"learning_rate": 6.569335362036773e-06,
"loss": 0.0477,
"step": 8970
},
{
"epoch": 2.2955010224948875,
"grad_norm": 2.8550946712493896,
"learning_rate": 6.560860820439557e-06,
"loss": 0.0386,
"step": 8980
},
{
"epoch": 2.2980572597137012,
"grad_norm": 6.75853967666626,
"learning_rate": 6.55238130787294e-06,
"loss": 0.0722,
"step": 8990
},
{
"epoch": 2.3006134969325154,
"grad_norm": 7.144791603088379,
"learning_rate": 6.543896851342148e-06,
"loss": 0.0713,
"step": 9000
},
{
"epoch": 2.303169734151329,
"grad_norm": 0.21670909225940704,
"learning_rate": 6.535407477868151e-06,
"loss": 0.0809,
"step": 9010
},
{
"epoch": 2.3057259713701432,
"grad_norm": 4.624710559844971,
"learning_rate": 6.526913214487578e-06,
"loss": 0.0727,
"step": 9020
},
{
"epoch": 2.308282208588957,
"grad_norm": 0.8301081657409668,
"learning_rate": 6.518414088252632e-06,
"loss": 0.0522,
"step": 9030
},
{
"epoch": 2.310838445807771,
"grad_norm": 0.1696474701166153,
"learning_rate": 6.509910126231003e-06,
"loss": 0.0482,
"step": 9040
},
{
"epoch": 2.313394683026585,
"grad_norm": 2.2531793117523193,
"learning_rate": 6.501401355505782e-06,
"loss": 0.0557,
"step": 9050
},
{
"epoch": 2.315950920245399,
"grad_norm": 4.994187831878662,
"learning_rate": 6.492887803175374e-06,
"loss": 0.0938,
"step": 9060
},
{
"epoch": 2.3185071574642127,
"grad_norm": 6.800015926361084,
"learning_rate": 6.484369496353412e-06,
"loss": 0.061,
"step": 9070
},
{
"epoch": 2.3210633946830264,
"grad_norm": 10.822134017944336,
"learning_rate": 6.4758464621686715e-06,
"loss": 0.0584,
"step": 9080
},
{
"epoch": 2.3236196319018405,
"grad_norm": 2.0743789672851562,
"learning_rate": 6.467318727764983e-06,
"loss": 0.0489,
"step": 9090
},
{
"epoch": 2.326175869120654,
"grad_norm": 6.299098968505859,
"learning_rate": 6.458786320301146e-06,
"loss": 0.0832,
"step": 9100
},
{
"epoch": 2.3287321063394684,
"grad_norm": 0.3529964089393616,
"learning_rate": 6.450249266950846e-06,
"loss": 0.0281,
"step": 9110
},
{
"epoch": 2.331288343558282,
"grad_norm": 1.3568120002746582,
"learning_rate": 6.4417075949025575e-06,
"loss": 0.0326,
"step": 9120
},
{
"epoch": 2.3338445807770962,
"grad_norm": 5.631685256958008,
"learning_rate": 6.43316133135947e-06,
"loss": 0.0618,
"step": 9130
},
{
"epoch": 2.33640081799591,
"grad_norm": 0.5822303295135498,
"learning_rate": 6.4246105035393965e-06,
"loss": 0.0483,
"step": 9140
},
{
"epoch": 2.338957055214724,
"grad_norm": 0.39102593064308167,
"learning_rate": 6.416055138674682e-06,
"loss": 0.0429,
"step": 9150
},
{
"epoch": 2.341513292433538,
"grad_norm": 4.372485637664795,
"learning_rate": 6.4074952640121226e-06,
"loss": 0.0795,
"step": 9160
},
{
"epoch": 2.3440695296523515,
"grad_norm": 0.6967136263847351,
"learning_rate": 6.398930906812877e-06,
"loss": 0.0307,
"step": 9170
},
{
"epoch": 2.3466257668711656,
"grad_norm": 5.449244022369385,
"learning_rate": 6.390362094352382e-06,
"loss": 0.0729,
"step": 9180
},
{
"epoch": 2.34918200408998,
"grad_norm": 5.0675482749938965,
"learning_rate": 6.3817888539202595e-06,
"loss": 0.0707,
"step": 9190
},
{
"epoch": 2.3517382413087935,
"grad_norm": 2.9963107109069824,
"learning_rate": 6.373211212820237e-06,
"loss": 0.0545,
"step": 9200
},
{
"epoch": 2.354294478527607,
"grad_norm": 3.9092769622802734,
"learning_rate": 6.364629198370054e-06,
"loss": 0.0281,
"step": 9210
},
{
"epoch": 2.3568507157464214,
"grad_norm": 8.632110595703125,
"learning_rate": 6.3560428379013795e-06,
"loss": 0.0994,
"step": 9220
},
{
"epoch": 2.359406952965235,
"grad_norm": 3.0046439170837402,
"learning_rate": 6.3474521587597234e-06,
"loss": 0.0505,
"step": 9230
},
{
"epoch": 2.361963190184049,
"grad_norm": 2.9390039443969727,
"learning_rate": 6.3388571883043505e-06,
"loss": 0.0561,
"step": 9240
},
{
"epoch": 2.364519427402863,
"grad_norm": 7.934990406036377,
"learning_rate": 6.330257953908192e-06,
"loss": 0.0442,
"step": 9250
},
{
"epoch": 2.367075664621677,
"grad_norm": 3.6421031951904297,
"learning_rate": 6.321654482957756e-06,
"loss": 0.0761,
"step": 9260
},
{
"epoch": 2.3696319018404908,
"grad_norm": 2.067728042602539,
"learning_rate": 6.313046802853047e-06,
"loss": 0.0361,
"step": 9270
},
{
"epoch": 2.372188139059305,
"grad_norm": 0.5931568741798401,
"learning_rate": 6.304434941007473e-06,
"loss": 0.0441,
"step": 9280
},
{
"epoch": 2.3747443762781186,
"grad_norm": 7.320766925811768,
"learning_rate": 6.295818924847761e-06,
"loss": 0.0736,
"step": 9290
},
{
"epoch": 2.3773006134969323,
"grad_norm": 5.4987688064575195,
"learning_rate": 6.2871987818138626e-06,
"loss": 0.0694,
"step": 9300
},
{
"epoch": 2.3798568507157465,
"grad_norm": 7.312312602996826,
"learning_rate": 6.2785745393588815e-06,
"loss": 0.0698,
"step": 9310
},
{
"epoch": 2.38241308793456,
"grad_norm": 8.894052505493164,
"learning_rate": 6.2699462249489715e-06,
"loss": 0.0651,
"step": 9320
},
{
"epoch": 2.3849693251533743,
"grad_norm": 0.4445403516292572,
"learning_rate": 6.261313866063257e-06,
"loss": 0.0271,
"step": 9330
},
{
"epoch": 2.387525562372188,
"grad_norm": 3.842348575592041,
"learning_rate": 6.252677490193739e-06,
"loss": 0.0625,
"step": 9340
},
{
"epoch": 2.390081799591002,
"grad_norm": 3.183258295059204,
"learning_rate": 6.244037124845217e-06,
"loss": 0.0454,
"step": 9350
},
{
"epoch": 2.392638036809816,
"grad_norm": 3.39320969581604,
"learning_rate": 6.235392797535193e-06,
"loss": 0.0615,
"step": 9360
},
{
"epoch": 2.39519427402863,
"grad_norm": 10.20765495300293,
"learning_rate": 6.226744535793788e-06,
"loss": 0.0808,
"step": 9370
},
{
"epoch": 2.3977505112474438,
"grad_norm": 0.8380181789398193,
"learning_rate": 6.2180923671636524e-06,
"loss": 0.0485,
"step": 9380
},
{
"epoch": 2.4003067484662575,
"grad_norm": 0.444444477558136,
"learning_rate": 6.20943631919988e-06,
"loss": 0.02,
"step": 9390
},
{
"epoch": 2.4028629856850716,
"grad_norm": 8.41584587097168,
"learning_rate": 6.200776419469918e-06,
"loss": 0.054,
"step": 9400
},
{
"epoch": 2.4054192229038853,
"grad_norm": 5.808600425720215,
"learning_rate": 6.192112695553483e-06,
"loss": 0.0671,
"step": 9410
},
{
"epoch": 2.4079754601226995,
"grad_norm": 3.8908348083496094,
"learning_rate": 6.183445175042466e-06,
"loss": 0.0618,
"step": 9420
},
{
"epoch": 2.410531697341513,
"grad_norm": 4.925373077392578,
"learning_rate": 6.174773885540855e-06,
"loss": 0.0512,
"step": 9430
},
{
"epoch": 2.4130879345603273,
"grad_norm": 0.03155489265918732,
"learning_rate": 6.166098854664638e-06,
"loss": 0.0356,
"step": 9440
},
{
"epoch": 2.415644171779141,
"grad_norm": 0.08001308143138885,
"learning_rate": 6.157420110041719e-06,
"loss": 0.031,
"step": 9450
},
{
"epoch": 2.418200408997955,
"grad_norm": 6.34970760345459,
"learning_rate": 6.1487376793118285e-06,
"loss": 0.0595,
"step": 9460
},
{
"epoch": 2.420756646216769,
"grad_norm": 15.385396957397461,
"learning_rate": 6.140051590126439e-06,
"loss": 0.0452,
"step": 9470
},
{
"epoch": 2.4233128834355826,
"grad_norm": 5.20993185043335,
"learning_rate": 6.131361870148672e-06,
"loss": 0.0745,
"step": 9480
},
{
"epoch": 2.4258691206543967,
"grad_norm": 4.343068599700928,
"learning_rate": 6.1226685470532125e-06,
"loss": 0.0639,
"step": 9490
},
{
"epoch": 2.4284253578732105,
"grad_norm": 4.774913787841797,
"learning_rate": 6.113971648526222e-06,
"loss": 0.0416,
"step": 9500
},
{
"epoch": 2.4309815950920246,
"grad_norm": 0.5611134767532349,
"learning_rate": 6.105271202265246e-06,
"loss": 0.0636,
"step": 9510
},
{
"epoch": 2.4335378323108383,
"grad_norm": 6.5504279136657715,
"learning_rate": 6.096567235979133e-06,
"loss": 0.0537,
"step": 9520
},
{
"epoch": 2.4360940695296525,
"grad_norm": 0.9646693468093872,
"learning_rate": 6.0878597773879376e-06,
"loss": 0.0512,
"step": 9530
},
{
"epoch": 2.438650306748466,
"grad_norm": 4.056527614593506,
"learning_rate": 6.079148854222839e-06,
"loss": 0.0451,
"step": 9540
},
{
"epoch": 2.4412065439672803,
"grad_norm": 5.754093170166016,
"learning_rate": 6.07043449422605e-06,
"loss": 0.0635,
"step": 9550
},
{
"epoch": 2.443762781186094,
"grad_norm": 7.742176532745361,
"learning_rate": 6.061716725150727e-06,
"loss": 0.0305,
"step": 9560
},
{
"epoch": 2.4463190184049077,
"grad_norm": 7.218969345092773,
"learning_rate": 6.052995574760887e-06,
"loss": 0.0615,
"step": 9570
},
{
"epoch": 2.448875255623722,
"grad_norm": 3.2471063137054443,
"learning_rate": 6.044271070831312e-06,
"loss": 0.0568,
"step": 9580
},
{
"epoch": 2.451431492842536,
"grad_norm": 0.23956027626991272,
"learning_rate": 6.035543241147469e-06,
"loss": 0.0468,
"step": 9590
},
{
"epoch": 2.4539877300613497,
"grad_norm": 0.8415837287902832,
"learning_rate": 6.026812113505409e-06,
"loss": 0.0366,
"step": 9600
},
{
"epoch": 2.4565439672801634,
"grad_norm": 4.5563154220581055,
"learning_rate": 6.018077715711695e-06,
"loss": 0.0611,
"step": 9610
},
{
"epoch": 2.4591002044989776,
"grad_norm": 9.952324867248535,
"learning_rate": 6.009340075583299e-06,
"loss": 0.0504,
"step": 9620
},
{
"epoch": 2.4616564417177913,
"grad_norm": 3.28727126121521,
"learning_rate": 6.00059922094752e-06,
"loss": 0.0563,
"step": 9630
},
{
"epoch": 2.4642126789366054,
"grad_norm": 4.564260959625244,
"learning_rate": 5.991855179641896e-06,
"loss": 0.0354,
"step": 9640
},
{
"epoch": 2.466768916155419,
"grad_norm": 5.473964214324951,
"learning_rate": 5.983107979514112e-06,
"loss": 0.0389,
"step": 9650
},
{
"epoch": 2.4693251533742333,
"grad_norm": 3.674219846725464,
"learning_rate": 5.974357648421916e-06,
"loss": 0.0745,
"step": 9660
},
{
"epoch": 2.471881390593047,
"grad_norm": 0.6603105068206787,
"learning_rate": 5.965604214233022e-06,
"loss": 0.0572,
"step": 9670
},
{
"epoch": 2.474437627811861,
"grad_norm": 4.627801895141602,
"learning_rate": 5.956847704825033e-06,
"loss": 0.0395,
"step": 9680
},
{
"epoch": 2.476993865030675,
"grad_norm": 2.601986885070801,
"learning_rate": 5.94808814808534e-06,
"loss": 0.0775,
"step": 9690
},
{
"epoch": 2.4795501022494886,
"grad_norm": 5.2239460945129395,
"learning_rate": 5.9393255719110455e-06,
"loss": 0.057,
"step": 9700
},
{
"epoch": 2.4821063394683027,
"grad_norm": 0.7189023494720459,
"learning_rate": 5.9305600042088595e-06,
"loss": 0.0669,
"step": 9710
},
{
"epoch": 2.4846625766871164,
"grad_norm": 4.483514308929443,
"learning_rate": 5.9217914728950286e-06,
"loss": 0.0511,
"step": 9720
},
{
"epoch": 2.4872188139059306,
"grad_norm": 1.7777493000030518,
"learning_rate": 5.913020005895232e-06,
"loss": 0.0491,
"step": 9730
},
{
"epoch": 2.4897750511247443,
"grad_norm": 6.096456050872803,
"learning_rate": 5.904245631144498e-06,
"loss": 0.0772,
"step": 9740
},
{
"epoch": 2.4923312883435584,
"grad_norm": 6.093538761138916,
"learning_rate": 5.895468376587121e-06,
"loss": 0.0738,
"step": 9750
},
{
"epoch": 2.494887525562372,
"grad_norm": 0.5627290606498718,
"learning_rate": 5.8866882701765605e-06,
"loss": 0.0428,
"step": 9760
},
{
"epoch": 2.4974437627811863,
"grad_norm": 2.499333143234253,
"learning_rate": 5.877905339875363e-06,
"loss": 0.0465,
"step": 9770
},
{
"epoch": 2.5,
"grad_norm": 2.476902723312378,
"learning_rate": 5.869119613655062e-06,
"loss": 0.033,
"step": 9780
},
{
"epoch": 2.5025562372188137,
"grad_norm": 3.85345458984375,
"learning_rate": 5.860331119496106e-06,
"loss": 0.0589,
"step": 9790
},
{
"epoch": 2.505112474437628,
"grad_norm": 0.05737360939383507,
"learning_rate": 5.851539885387748e-06,
"loss": 0.0693,
"step": 9800
},
{
"epoch": 2.5076687116564416,
"grad_norm": 5.509619235992432,
"learning_rate": 5.8427459393279736e-06,
"loss": 0.0514,
"step": 9810
},
{
"epoch": 2.5102249488752557,
"grad_norm": 5.018087863922119,
"learning_rate": 5.8339493093234025e-06,
"loss": 0.0638,
"step": 9820
},
{
"epoch": 2.5127811860940694,
"grad_norm": 5.845489501953125,
"learning_rate": 5.825150023389203e-06,
"loss": 0.0408,
"step": 9830
},
{
"epoch": 2.5153374233128836,
"grad_norm": 3.2593860626220703,
"learning_rate": 5.816348109549005e-06,
"loss": 0.0141,
"step": 9840
},
{
"epoch": 2.5178936605316973,
"grad_norm": 5.271510124206543,
"learning_rate": 5.807543595834799e-06,
"loss": 0.0526,
"step": 9850
},
{
"epoch": 2.5204498977505114,
"grad_norm": 0.1252453476190567,
"learning_rate": 5.798736510286866e-06,
"loss": 0.0522,
"step": 9860
},
{
"epoch": 2.523006134969325,
"grad_norm": 4.33157205581665,
"learning_rate": 5.7899268809536705e-06,
"loss": 0.0888,
"step": 9870
},
{
"epoch": 2.525562372188139,
"grad_norm": 6.989223480224609,
"learning_rate": 5.781114735891781e-06,
"loss": 0.0413,
"step": 9880
},
{
"epoch": 2.528118609406953,
"grad_norm": 4.28364896774292,
"learning_rate": 5.772300103165777e-06,
"loss": 0.0438,
"step": 9890
},
{
"epoch": 2.530674846625767,
"grad_norm": 0.42973408102989197,
"learning_rate": 5.763483010848161e-06,
"loss": 0.0537,
"step": 9900
},
{
"epoch": 2.533231083844581,
"grad_norm": 3.0371270179748535,
"learning_rate": 5.7546634870192695e-06,
"loss": 0.0482,
"step": 9910
},
{
"epoch": 2.5357873210633946,
"grad_norm": 10.468814849853516,
"learning_rate": 5.745841559767182e-06,
"loss": 0.0593,
"step": 9920
},
{
"epoch": 2.5383435582822087,
"grad_norm": 1.1547623872756958,
"learning_rate": 5.737017257187634e-06,
"loss": 0.0457,
"step": 9930
},
{
"epoch": 2.5408997955010224,
"grad_norm": 5.563620567321777,
"learning_rate": 5.728190607383921e-06,
"loss": 0.0876,
"step": 9940
},
{
"epoch": 2.5434560327198366,
"grad_norm": 1.5379348993301392,
"learning_rate": 5.719361638466819e-06,
"loss": 0.0441,
"step": 9950
},
{
"epoch": 2.5460122699386503,
"grad_norm": 4.261902809143066,
"learning_rate": 5.7105303785544894e-06,
"loss": 0.0243,
"step": 9960
},
{
"epoch": 2.548568507157464,
"grad_norm": 5.261180400848389,
"learning_rate": 5.7016968557723874e-06,
"loss": 0.0309,
"step": 9970
},
{
"epoch": 2.551124744376278,
"grad_norm": 2.3447492122650146,
"learning_rate": 5.692861098253174e-06,
"loss": 0.0348,
"step": 9980
},
{
"epoch": 2.5536809815950923,
"grad_norm": 2.6790072917938232,
"learning_rate": 5.684023134136634e-06,
"loss": 0.0353,
"step": 9990
},
{
"epoch": 2.556237218813906,
"grad_norm": 3.520054817199707,
"learning_rate": 5.67518299156957e-06,
"loss": 0.0852,
"step": 10000
},
{
"epoch": 2.5587934560327197,
"grad_norm": 4.114648342132568,
"learning_rate": 5.66634069870573e-06,
"loss": 0.0708,
"step": 10010
},
{
"epoch": 2.561349693251534,
"grad_norm": 1.8037816286087036,
"learning_rate": 5.657496283705708e-06,
"loss": 0.0496,
"step": 10020
},
{
"epoch": 2.5639059304703475,
"grad_norm": 4.1163716316223145,
"learning_rate": 5.648649774736855e-06,
"loss": 0.0555,
"step": 10030
},
{
"epoch": 2.5664621676891617,
"grad_norm": 3.350024700164795,
"learning_rate": 5.639801199973191e-06,
"loss": 0.0262,
"step": 10040
},
{
"epoch": 2.5690184049079754,
"grad_norm": 7.735722541809082,
"learning_rate": 5.630950587595319e-06,
"loss": 0.0463,
"step": 10050
},
{
"epoch": 2.571574642126789,
"grad_norm": 4.3258538246154785,
"learning_rate": 5.622097965790325e-06,
"loss": 0.0553,
"step": 10060
},
{
"epoch": 2.5741308793456033,
"grad_norm": 4.328603744506836,
"learning_rate": 5.6132433627517005e-06,
"loss": 0.0632,
"step": 10070
},
{
"epoch": 2.5766871165644174,
"grad_norm": 4.42746114730835,
"learning_rate": 5.6043868066792415e-06,
"loss": 0.0503,
"step": 10080
},
{
"epoch": 2.579243353783231,
"grad_norm": 3.8148934841156006,
"learning_rate": 5.595528325778968e-06,
"loss": 0.0607,
"step": 10090
},
{
"epoch": 2.581799591002045,
"grad_norm": 3.635321617126465,
"learning_rate": 5.58666794826303e-06,
"loss": 0.0448,
"step": 10100
},
{
"epoch": 2.584355828220859,
"grad_norm": 0.01259413082152605,
"learning_rate": 5.577805702349614e-06,
"loss": 0.0408,
"step": 10110
},
{
"epoch": 2.5869120654396727,
"grad_norm": 3.7854018211364746,
"learning_rate": 5.568941616262861e-06,
"loss": 0.0585,
"step": 10120
},
{
"epoch": 2.589468302658487,
"grad_norm": 1.9844086170196533,
"learning_rate": 5.5600757182327695e-06,
"loss": 0.0263,
"step": 10130
},
{
"epoch": 2.5920245398773005,
"grad_norm": 5.477379322052002,
"learning_rate": 5.5512080364951105e-06,
"loss": 0.0553,
"step": 10140
},
{
"epoch": 2.5945807770961147,
"grad_norm": 6.039186954498291,
"learning_rate": 5.542338599291335e-06,
"loss": 0.0379,
"step": 10150
},
{
"epoch": 2.5971370143149284,
"grad_norm": 0.8108224272727966,
"learning_rate": 5.533467434868486e-06,
"loss": 0.0534,
"step": 10160
},
{
"epoch": 2.5996932515337425,
"grad_norm": 4.296036243438721,
"learning_rate": 5.524594571479104e-06,
"loss": 0.036,
"step": 10170
},
{
"epoch": 2.6022494887525562,
"grad_norm": 3.003478765487671,
"learning_rate": 5.515720037381144e-06,
"loss": 0.0471,
"step": 10180
},
{
"epoch": 2.60480572597137,
"grad_norm": 4.551526069641113,
"learning_rate": 5.50684386083788e-06,
"loss": 0.0675,
"step": 10190
},
{
"epoch": 2.607361963190184,
"grad_norm": 2.661856174468994,
"learning_rate": 5.497966070117816e-06,
"loss": 0.0298,
"step": 10200
},
{
"epoch": 2.609918200408998,
"grad_norm": 3.3979713916778564,
"learning_rate": 5.4890866934946e-06,
"loss": 0.0422,
"step": 10210
},
{
"epoch": 2.612474437627812,
"grad_norm": 5.5186896324157715,
"learning_rate": 5.480205759246926e-06,
"loss": 0.0471,
"step": 10220
},
{
"epoch": 2.6150306748466257,
"grad_norm": 3.5918192863464355,
"learning_rate": 5.471323295658455e-06,
"loss": 0.0692,
"step": 10230
},
{
"epoch": 2.61758691206544,
"grad_norm": 4.837007999420166,
"learning_rate": 5.462439331017711e-06,
"loss": 0.0464,
"step": 10240
},
{
"epoch": 2.6201431492842535,
"grad_norm": 1.8546375036239624,
"learning_rate": 5.453553893618003e-06,
"loss": 0.0397,
"step": 10250
},
{
"epoch": 2.6226993865030677,
"grad_norm": 7.079483985900879,
"learning_rate": 5.44466701175733e-06,
"loss": 0.0308,
"step": 10260
},
{
"epoch": 2.6252556237218814,
"grad_norm": 0.1995091438293457,
"learning_rate": 5.435778713738292e-06,
"loss": 0.0247,
"step": 10270
},
{
"epoch": 2.627811860940695,
"grad_norm": 5.363643169403076,
"learning_rate": 5.426889027867997e-06,
"loss": 0.0418,
"step": 10280
},
{
"epoch": 2.6303680981595092,
"grad_norm": 1.1156824827194214,
"learning_rate": 5.417997982457974e-06,
"loss": 0.0631,
"step": 10290
},
{
"epoch": 2.6329243353783234,
"grad_norm": 0.43824976682662964,
"learning_rate": 5.409105605824082e-06,
"loss": 0.0433,
"step": 10300
},
{
"epoch": 2.635480572597137,
"grad_norm": 0.8822130560874939,
"learning_rate": 5.400211926286421e-06,
"loss": 0.0247,
"step": 10310
},
{
"epoch": 2.638036809815951,
"grad_norm": 3.7047805786132812,
"learning_rate": 5.391316972169236e-06,
"loss": 0.039,
"step": 10320
},
{
"epoch": 2.640593047034765,
"grad_norm": 3.4349169731140137,
"learning_rate": 5.382420771800836e-06,
"loss": 0.0148,
"step": 10330
},
{
"epoch": 2.6431492842535786,
"grad_norm": 4.191125392913818,
"learning_rate": 5.373523353513498e-06,
"loss": 0.0671,
"step": 10340
},
{
"epoch": 2.645705521472393,
"grad_norm": 8.565727233886719,
"learning_rate": 5.364624745643375e-06,
"loss": 0.0534,
"step": 10350
},
{
"epoch": 2.6482617586912065,
"grad_norm": 5.6679840087890625,
"learning_rate": 5.35572497653041e-06,
"loss": 0.0493,
"step": 10360
},
{
"epoch": 2.65081799591002,
"grad_norm": 2.3933236598968506,
"learning_rate": 5.346824074518246e-06,
"loss": 0.05,
"step": 10370
},
{
"epoch": 2.6533742331288344,
"grad_norm": 0.3358931839466095,
"learning_rate": 5.337922067954136e-06,
"loss": 0.0137,
"step": 10380
},
{
"epoch": 2.6559304703476485,
"grad_norm": 2.881453275680542,
"learning_rate": 5.329018985188841e-06,
"loss": 0.0689,
"step": 10390
},
{
"epoch": 2.658486707566462,
"grad_norm": 3.27288818359375,
"learning_rate": 5.320114854576559e-06,
"loss": 0.0297,
"step": 10400
},
{
"epoch": 2.661042944785276,
"grad_norm": 3.823456287384033,
"learning_rate": 5.3112097044748235e-06,
"loss": 0.0607,
"step": 10410
},
{
"epoch": 2.66359918200409,
"grad_norm": 3.608356475830078,
"learning_rate": 5.302303563244413e-06,
"loss": 0.0381,
"step": 10420
},
{
"epoch": 2.6661554192229038,
"grad_norm": 1.3827208280563354,
"learning_rate": 5.2933964592492614e-06,
"loss": 0.05,
"step": 10430
},
{
"epoch": 2.668711656441718,
"grad_norm": 0.04524281620979309,
"learning_rate": 5.284488420856372e-06,
"loss": 0.0268,
"step": 10440
},
{
"epoch": 2.6712678936605316,
"grad_norm": 7.237791538238525,
"learning_rate": 5.275579476435719e-06,
"loss": 0.0239,
"step": 10450
},
{
"epoch": 2.6738241308793453,
"grad_norm": 0.08604143559932709,
"learning_rate": 5.2666696543601696e-06,
"loss": 0.0819,
"step": 10460
},
{
"epoch": 2.6763803680981595,
"grad_norm": 4.662979602813721,
"learning_rate": 5.25775898300538e-06,
"loss": 0.0539,
"step": 10470
},
{
"epoch": 2.6789366053169736,
"grad_norm": 0.7715989947319031,
"learning_rate": 5.248847490749711e-06,
"loss": 0.0375,
"step": 10480
},
{
"epoch": 2.6814928425357873,
"grad_norm": 5.067183971405029,
"learning_rate": 5.239935205974145e-06,
"loss": 0.0205,
"step": 10490
},
{
"epoch": 2.684049079754601,
"grad_norm": 5.718189716339111,
"learning_rate": 5.231022157062177e-06,
"loss": 0.0898,
"step": 10500
},
{
"epoch": 2.686605316973415,
"grad_norm": 14.444259643554688,
"learning_rate": 5.222108372399746e-06,
"loss": 0.043,
"step": 10510
},
{
"epoch": 2.689161554192229,
"grad_norm": 0.2056499719619751,
"learning_rate": 5.213193880375127e-06,
"loss": 0.0639,
"step": 10520
},
{
"epoch": 2.691717791411043,
"grad_norm": 0.02217238023877144,
"learning_rate": 5.204278709378854e-06,
"loss": 0.0177,
"step": 10530
},
{
"epoch": 2.6942740286298568,
"grad_norm": 1.9676077365875244,
"learning_rate": 5.195362887803617e-06,
"loss": 0.0495,
"step": 10540
},
{
"epoch": 2.696830265848671,
"grad_norm": 2.4766979217529297,
"learning_rate": 5.186446444044184e-06,
"loss": 0.0572,
"step": 10550
},
{
"epoch": 2.6993865030674846,
"grad_norm": 0.9265226721763611,
"learning_rate": 5.177529406497298e-06,
"loss": 0.0192,
"step": 10560
},
{
"epoch": 2.7019427402862988,
"grad_norm": 6.686746597290039,
"learning_rate": 5.168611803561599e-06,
"loss": 0.0632,
"step": 10570
},
{
"epoch": 2.7044989775051125,
"grad_norm": 4.72622013092041,
"learning_rate": 5.159693663637525e-06,
"loss": 0.0499,
"step": 10580
},
{
"epoch": 2.707055214723926,
"grad_norm": 4.173243045806885,
"learning_rate": 5.150775015127224e-06,
"loss": 0.0343,
"step": 10590
},
{
"epoch": 2.7096114519427403,
"grad_norm": 0.10401232540607452,
"learning_rate": 5.1418558864344645e-06,
"loss": 0.0417,
"step": 10600
},
{
"epoch": 2.712167689161554,
"grad_norm": 4.092282772064209,
"learning_rate": 5.132936305964543e-06,
"loss": 0.0335,
"step": 10610
},
{
"epoch": 2.714723926380368,
"grad_norm": 8.394328117370605,
"learning_rate": 5.1240163021241975e-06,
"loss": 0.0785,
"step": 10620
},
{
"epoch": 2.717280163599182,
"grad_norm": 3.676940441131592,
"learning_rate": 5.1150959033215104e-06,
"loss": 0.0382,
"step": 10630
},
{
"epoch": 2.719836400817996,
"grad_norm": 0.23662449419498444,
"learning_rate": 5.106175137965826e-06,
"loss": 0.0467,
"step": 10640
},
{
"epoch": 2.7223926380368098,
"grad_norm": 6.808079719543457,
"learning_rate": 5.097254034467652e-06,
"loss": 0.0348,
"step": 10650
},
{
"epoch": 2.724948875255624,
"grad_norm": 0.04969576373696327,
"learning_rate": 5.0883326212385775e-06,
"loss": 0.031,
"step": 10660
},
{
"epoch": 2.7275051124744376,
"grad_norm": 6.316954612731934,
"learning_rate": 5.079410926691174e-06,
"loss": 0.053,
"step": 10670
},
{
"epoch": 2.7300613496932513,
"grad_norm": 4.699779987335205,
"learning_rate": 5.07048897923891e-06,
"loss": 0.0328,
"step": 10680
},
{
"epoch": 2.7326175869120655,
"grad_norm": 2.899876117706299,
"learning_rate": 5.061566807296062e-06,
"loss": 0.0537,
"step": 10690
},
{
"epoch": 2.7351738241308796,
"grad_norm": 1.7334074974060059,
"learning_rate": 5.052644439277617e-06,
"loss": 0.036,
"step": 10700
},
{
"epoch": 2.7377300613496933,
"grad_norm": 0.5449509024620056,
"learning_rate": 5.043721903599193e-06,
"loss": 0.0199,
"step": 10710
},
{
"epoch": 2.740286298568507,
"grad_norm": 0.7619210481643677,
"learning_rate": 5.0347992286769324e-06,
"loss": 0.0349,
"step": 10720
},
{
"epoch": 2.742842535787321,
"grad_norm": 0.09413593262434006,
"learning_rate": 5.025876442927429e-06,
"loss": 0.0579,
"step": 10730
},
{
"epoch": 2.745398773006135,
"grad_norm": 2.7584242820739746,
"learning_rate": 5.016953574767629e-06,
"loss": 0.0824,
"step": 10740
},
{
"epoch": 2.747955010224949,
"grad_norm": 3.956817626953125,
"learning_rate": 5.008030652614737e-06,
"loss": 0.0461,
"step": 10750
},
{
"epoch": 2.7505112474437627,
"grad_norm": 0.14918692409992218,
"learning_rate": 4.99910770488613e-06,
"loss": 0.0116,
"step": 10760
},
{
"epoch": 2.7530674846625764,
"grad_norm": 4.674230098724365,
"learning_rate": 4.990184759999271e-06,
"loss": 0.0704,
"step": 10770
},
{
"epoch": 2.7556237218813906,
"grad_norm": 4.550516128540039,
"learning_rate": 4.981261846371612e-06,
"loss": 0.0328,
"step": 10780
},
{
"epoch": 2.7581799591002047,
"grad_norm": 5.67306661605835,
"learning_rate": 4.972338992420501e-06,
"loss": 0.0425,
"step": 10790
},
{
"epoch": 2.7607361963190185,
"grad_norm": 3.2620246410369873,
"learning_rate": 4.9634162265631016e-06,
"loss": 0.0281,
"step": 10800
},
{
"epoch": 2.763292433537832,
"grad_norm": 5.77325963973999,
"learning_rate": 4.954493577216294e-06,
"loss": 0.0263,
"step": 10810
},
{
"epoch": 2.7658486707566463,
"grad_norm": 7.105217933654785,
"learning_rate": 4.9455710727965886e-06,
"loss": 0.0971,
"step": 10820
},
{
"epoch": 2.76840490797546,
"grad_norm": 8.464949607849121,
"learning_rate": 4.936648741720032e-06,
"loss": 0.0459,
"step": 10830
},
{
"epoch": 2.770961145194274,
"grad_norm": 9.054972648620605,
"learning_rate": 4.9277266124021245e-06,
"loss": 0.0335,
"step": 10840
},
{
"epoch": 2.773517382413088,
"grad_norm": 1.2454347610473633,
"learning_rate": 4.918804713257715e-06,
"loss": 0.0471,
"step": 10850
},
{
"epoch": 2.7760736196319016,
"grad_norm": 2.4472923278808594,
"learning_rate": 4.909883072700928e-06,
"loss": 0.0462,
"step": 10860
},
{
"epoch": 2.7786298568507157,
"grad_norm": 0.04563615098595619,
"learning_rate": 4.900961719145056e-06,
"loss": 0.0167,
"step": 10870
},
{
"epoch": 2.78118609406953,
"grad_norm": 5.5846734046936035,
"learning_rate": 4.892040681002488e-06,
"loss": 0.0578,
"step": 10880
},
{
"epoch": 2.7837423312883436,
"grad_norm": 4.339868068695068,
"learning_rate": 4.883119986684596e-06,
"loss": 0.0273,
"step": 10890
},
{
"epoch": 2.7862985685071573,
"grad_norm": 4.785184383392334,
"learning_rate": 4.87419966460167e-06,
"loss": 0.076,
"step": 10900
},
{
"epoch": 2.7888548057259714,
"grad_norm": 0.035292405635118484,
"learning_rate": 4.865279743162804e-06,
"loss": 0.0462,
"step": 10910
},
{
"epoch": 2.791411042944785,
"grad_norm": 3.155709743499756,
"learning_rate": 4.856360250775821e-06,
"loss": 0.036,
"step": 10920
},
{
"epoch": 2.7939672801635993,
"grad_norm": 0.7432450652122498,
"learning_rate": 4.847441215847177e-06,
"loss": 0.0619,
"step": 10930
},
{
"epoch": 2.796523517382413,
"grad_norm": 5.327913761138916,
"learning_rate": 4.838522666781871e-06,
"loss": 0.0647,
"step": 10940
},
{
"epoch": 2.799079754601227,
"grad_norm": 2.4953877925872803,
"learning_rate": 4.829604631983353e-06,
"loss": 0.0392,
"step": 10950
},
{
"epoch": 2.801635991820041,
"grad_norm": 3.6846439838409424,
"learning_rate": 4.8206871398534385e-06,
"loss": 0.0368,
"step": 10960
},
{
"epoch": 2.804192229038855,
"grad_norm": 5.844122886657715,
"learning_rate": 4.811770218792212e-06,
"loss": 0.0476,
"step": 10970
},
{
"epoch": 2.8067484662576687,
"grad_norm": 4.004204273223877,
"learning_rate": 4.80285389719794e-06,
"loss": 0.0589,
"step": 10980
},
{
"epoch": 2.8093047034764824,
"grad_norm": 0.9968608021736145,
"learning_rate": 4.793938203466979e-06,
"loss": 0.0448,
"step": 10990
},
{
"epoch": 2.8118609406952966,
"grad_norm": 6.936352252960205,
"learning_rate": 4.78502316599369e-06,
"loss": 0.0447,
"step": 11000
},
{
"epoch": 2.8144171779141103,
"grad_norm": 4.1466383934021,
"learning_rate": 4.776108813170337e-06,
"loss": 0.0406,
"step": 11010
},
{
"epoch": 2.8169734151329244,
"grad_norm": 12.088165283203125,
"learning_rate": 4.76719517338701e-06,
"loss": 0.0544,
"step": 11020
},
{
"epoch": 2.819529652351738,
"grad_norm": 3.7247049808502197,
"learning_rate": 4.758282275031524e-06,
"loss": 0.0304,
"step": 11030
},
{
"epoch": 2.8220858895705523,
"grad_norm": 5.583109378814697,
"learning_rate": 4.7493701464893366e-06,
"loss": 0.0326,
"step": 11040
},
{
"epoch": 2.824642126789366,
"grad_norm": 1.8860771656036377,
"learning_rate": 4.740458816143447e-06,
"loss": 0.0268,
"step": 11050
},
{
"epoch": 2.82719836400818,
"grad_norm": 2.164116144180298,
"learning_rate": 4.731548312374323e-06,
"loss": 0.0403,
"step": 11060
},
{
"epoch": 2.829754601226994,
"grad_norm": 3.961606740951538,
"learning_rate": 4.722638663559787e-06,
"loss": 0.039,
"step": 11070
},
{
"epoch": 2.8323108384458076,
"grad_norm": 0.07476239651441574,
"learning_rate": 4.713729898074949e-06,
"loss": 0.0522,
"step": 11080
},
{
"epoch": 2.8348670756646217,
"grad_norm": 4.681721210479736,
"learning_rate": 4.704822044292103e-06,
"loss": 0.0413,
"step": 11090
},
{
"epoch": 2.837423312883436,
"grad_norm": 4.108366012573242,
"learning_rate": 4.695915130580636e-06,
"loss": 0.0305,
"step": 11100
},
{
"epoch": 2.8399795501022496,
"grad_norm": 0.2699336111545563,
"learning_rate": 4.687009185306945e-06,
"loss": 0.0495,
"step": 11110
},
{
"epoch": 2.8425357873210633,
"grad_norm": 3.466141939163208,
"learning_rate": 4.678104236834341e-06,
"loss": 0.0725,
"step": 11120
},
{
"epoch": 2.8450920245398774,
"grad_norm": 3.030548334121704,
"learning_rate": 4.6692003135229606e-06,
"loss": 0.0405,
"step": 11130
},
{
"epoch": 2.847648261758691,
"grad_norm": 4.3781938552856445,
"learning_rate": 4.660297443729675e-06,
"loss": 0.0209,
"step": 11140
},
{
"epoch": 2.8502044989775053,
"grad_norm": 0.2208949774503708,
"learning_rate": 4.6513956558080034e-06,
"loss": 0.0237,
"step": 11150
},
{
"epoch": 2.852760736196319,
"grad_norm": 4.45728874206543,
"learning_rate": 4.642494978108014e-06,
"loss": 0.0528,
"step": 11160
},
{
"epoch": 2.8553169734151327,
"grad_norm": 6.202856063842773,
"learning_rate": 4.633595438976244e-06,
"loss": 0.0534,
"step": 11170
},
{
"epoch": 2.857873210633947,
"grad_norm": 3.93393874168396,
"learning_rate": 4.624697066755602e-06,
"loss": 0.0261,
"step": 11180
},
{
"epoch": 2.860429447852761,
"grad_norm": 1.9619215726852417,
"learning_rate": 4.6157998897852815e-06,
"loss": 0.0429,
"step": 11190
},
{
"epoch": 2.8629856850715747,
"grad_norm": 5.04984188079834,
"learning_rate": 4.606903936400667e-06,
"loss": 0.0428,
"step": 11200
},
{
"epoch": 2.8655419222903884,
"grad_norm": 3.097203254699707,
"learning_rate": 4.5980092349332525e-06,
"loss": 0.0336,
"step": 11210
},
{
"epoch": 2.8680981595092025,
"grad_norm": 1.7928495407104492,
"learning_rate": 4.589115813710535e-06,
"loss": 0.0516,
"step": 11220
},
{
"epoch": 2.8706543967280163,
"grad_norm": 3.5692665576934814,
"learning_rate": 4.580223701055945e-06,
"loss": 0.0328,
"step": 11230
},
{
"epoch": 2.8732106339468304,
"grad_norm": 1.9397566318511963,
"learning_rate": 4.571332925288735e-06,
"loss": 0.0255,
"step": 11240
},
{
"epoch": 2.875766871165644,
"grad_norm": 3.0860631465911865,
"learning_rate": 4.562443514723911e-06,
"loss": 0.0356,
"step": 11250
},
{
"epoch": 2.878323108384458,
"grad_norm": 3.6334643363952637,
"learning_rate": 4.553555497672119e-06,
"loss": 0.0535,
"step": 11260
},
{
"epoch": 2.880879345603272,
"grad_norm": 5.285019397735596,
"learning_rate": 4.544668902439577e-06,
"loss": 0.073,
"step": 11270
},
{
"epoch": 2.883435582822086,
"grad_norm": 0.21129778027534485,
"learning_rate": 4.53578375732797e-06,
"loss": 0.0175,
"step": 11280
},
{
"epoch": 2.8859918200409,
"grad_norm": 0.07329968363046646,
"learning_rate": 4.526900090634368e-06,
"loss": 0.0222,
"step": 11290
},
{
"epoch": 2.8885480572597135,
"grad_norm": 2.5236427783966064,
"learning_rate": 4.518017930651128e-06,
"loss": 0.0439,
"step": 11300
},
{
"epoch": 2.8911042944785277,
"grad_norm": 0.4075072407722473,
"learning_rate": 4.509137305665812e-06,
"loss": 0.0405,
"step": 11310
},
{
"epoch": 2.8936605316973414,
"grad_norm": 1.6199369430541992,
"learning_rate": 4.5002582439610895e-06,
"loss": 0.019,
"step": 11320
},
{
"epoch": 2.8962167689161555,
"grad_norm": 0.04643448814749718,
"learning_rate": 4.491380773814659e-06,
"loss": 0.0212,
"step": 11330
},
{
"epoch": 2.8987730061349692,
"grad_norm": 1.4235713481903076,
"learning_rate": 4.4825049234991405e-06,
"loss": 0.0105,
"step": 11340
},
{
"epoch": 2.9013292433537834,
"grad_norm": 0.04633248969912529,
"learning_rate": 4.473630721282004e-06,
"loss": 0.0261,
"step": 11350
},
{
"epoch": 2.903885480572597,
"grad_norm": 5.469078063964844,
"learning_rate": 4.464758195425464e-06,
"loss": 0.0275,
"step": 11360
},
{
"epoch": 2.9064417177914113,
"grad_norm": 0.15273931622505188,
"learning_rate": 4.455887374186401e-06,
"loss": 0.0297,
"step": 11370
},
{
"epoch": 2.908997955010225,
"grad_norm": 0.10551747679710388,
"learning_rate": 4.447018285816263e-06,
"loss": 0.0285,
"step": 11380
},
{
"epoch": 2.9115541922290387,
"grad_norm": 0.063129723072052,
"learning_rate": 4.438150958560983e-06,
"loss": 0.028,
"step": 11390
},
{
"epoch": 2.914110429447853,
"grad_norm": 0.8330835700035095,
"learning_rate": 4.42928542066088e-06,
"loss": 0.0227,
"step": 11400
},
{
"epoch": 2.9166666666666665,
"grad_norm": 5.185162544250488,
"learning_rate": 4.420421700350581e-06,
"loss": 0.0378,
"step": 11410
},
{
"epoch": 2.9192229038854807,
"grad_norm": 0.602056622505188,
"learning_rate": 4.4115598258589165e-06,
"loss": 0.0259,
"step": 11420
},
{
"epoch": 2.9217791411042944,
"grad_norm": 3.8201723098754883,
"learning_rate": 4.402699825408849e-06,
"loss": 0.0373,
"step": 11430
},
{
"epoch": 2.9243353783231085,
"grad_norm": 0.2384403496980667,
"learning_rate": 4.393841727217361e-06,
"loss": 0.0158,
"step": 11440
},
{
"epoch": 2.9268916155419222,
"grad_norm": 2.9862217903137207,
"learning_rate": 4.384985559495387e-06,
"loss": 0.0573,
"step": 11450
},
{
"epoch": 2.9294478527607364,
"grad_norm": 5.518589019775391,
"learning_rate": 4.376131350447703e-06,
"loss": 0.0331,
"step": 11460
},
{
"epoch": 2.93200408997955,
"grad_norm": 6.048367500305176,
"learning_rate": 4.36727912827286e-06,
"loss": 0.0422,
"step": 11470
},
{
"epoch": 2.934560327198364,
"grad_norm": 5.123732089996338,
"learning_rate": 4.358428921163066e-06,
"loss": 0.0287,
"step": 11480
},
{
"epoch": 2.937116564417178,
"grad_norm": 4.53354549407959,
"learning_rate": 4.349580757304127e-06,
"loss": 0.0191,
"step": 11490
},
{
"epoch": 2.939672801635992,
"grad_norm": 1.6047019958496094,
"learning_rate": 4.34073466487533e-06,
"loss": 0.0529,
"step": 11500
},
{
"epoch": 2.942229038854806,
"grad_norm": 0.1400771290063858,
"learning_rate": 4.331890672049371e-06,
"loss": 0.029,
"step": 11510
},
{
"epoch": 2.9447852760736195,
"grad_norm": 4.497285842895508,
"learning_rate": 4.323048806992257e-06,
"loss": 0.031,
"step": 11520
},
{
"epoch": 2.9473415132924337,
"grad_norm": 5.1836442947387695,
"learning_rate": 4.31420909786322e-06,
"loss": 0.0347,
"step": 11530
},
{
"epoch": 2.9498977505112474,
"grad_norm": 0.12893950939178467,
"learning_rate": 4.305371572814623e-06,
"loss": 0.0141,
"step": 11540
},
{
"epoch": 2.9524539877300615,
"grad_norm": 5.480885028839111,
"learning_rate": 4.296536259991876e-06,
"loss": 0.0223,
"step": 11550
},
{
"epoch": 2.955010224948875,
"grad_norm": 15.032180786132812,
"learning_rate": 4.287703187533346e-06,
"loss": 0.0722,
"step": 11560
},
{
"epoch": 2.957566462167689,
"grad_norm": 2.98856520652771,
"learning_rate": 4.278872383570256e-06,
"loss": 0.0248,
"step": 11570
},
{
"epoch": 2.960122699386503,
"grad_norm": 3.5357167720794678,
"learning_rate": 4.270043876226616e-06,
"loss": 0.0385,
"step": 11580
},
{
"epoch": 2.9626789366053172,
"grad_norm": 1.0948529243469238,
"learning_rate": 4.2612176936191104e-06,
"loss": 0.0293,
"step": 11590
},
{
"epoch": 2.965235173824131,
"grad_norm": 1.0036929845809937,
"learning_rate": 4.252393863857033e-06,
"loss": 0.0598,
"step": 11600
},
{
"epoch": 2.9677914110429446,
"grad_norm": 5.068575382232666,
"learning_rate": 4.243572415042168e-06,
"loss": 0.0479,
"step": 11610
},
{
"epoch": 2.970347648261759,
"grad_norm": 2.0871167182922363,
"learning_rate": 4.2347533752687335e-06,
"loss": 0.0228,
"step": 11620
},
{
"epoch": 2.9729038854805725,
"grad_norm": 0.04474279657006264,
"learning_rate": 4.225936772623262e-06,
"loss": 0.0119,
"step": 11630
},
{
"epoch": 2.9754601226993866,
"grad_norm": 3.878139019012451,
"learning_rate": 4.217122635184532e-06,
"loss": 0.0333,
"step": 11640
},
{
"epoch": 2.9780163599182004,
"grad_norm": 0.04483529180288315,
"learning_rate": 4.208310991023469e-06,
"loss": 0.0411,
"step": 11650
},
{
"epoch": 2.980572597137014,
"grad_norm": 0.06956873834133148,
"learning_rate": 4.199501868203059e-06,
"loss": 0.015,
"step": 11660
},
{
"epoch": 2.983128834355828,
"grad_norm": 4.716834545135498,
"learning_rate": 4.190695294778254e-06,
"loss": 0.0272,
"step": 11670
},
{
"epoch": 2.9856850715746424,
"grad_norm": 4.978919506072998,
"learning_rate": 4.1818912987958935e-06,
"loss": 0.0349,
"step": 11680
},
{
"epoch": 2.988241308793456,
"grad_norm": 4.98551607131958,
"learning_rate": 4.1730899082946e-06,
"loss": 0.0391,
"step": 11690
},
{
"epoch": 2.9907975460122698,
"grad_norm": 0.028066415339708328,
"learning_rate": 4.164291151304707e-06,
"loss": 0.0366,
"step": 11700
},
{
"epoch": 2.993353783231084,
"grad_norm": 3.7603607177734375,
"learning_rate": 4.155495055848154e-06,
"loss": 0.0309,
"step": 11710
},
{
"epoch": 2.9959100204498976,
"grad_norm": 6.2368621826171875,
"learning_rate": 4.146701649938409e-06,
"loss": 0.0526,
"step": 11720
},
{
"epoch": 2.9984662576687118,
"grad_norm": 1.746232032775879,
"learning_rate": 4.13791096158037e-06,
"loss": 0.018,
"step": 11730
},
{
"epoch": 3.0010224948875255,
"grad_norm": 3.928952693939209,
"learning_rate": 4.129123018770285e-06,
"loss": 0.0108,
"step": 11740
},
{
"epoch": 3.0035787321063396,
"grad_norm": 0.7030458450317383,
"learning_rate": 4.120337849495654e-06,
"loss": 0.019,
"step": 11750
},
{
"epoch": 3.0061349693251533,
"grad_norm": 1.5258599519729614,
"learning_rate": 4.111555481735147e-06,
"loss": 0.0215,
"step": 11760
},
{
"epoch": 3.0086912065439675,
"grad_norm": 3.1201798915863037,
"learning_rate": 4.102775943458508e-06,
"loss": 0.015,
"step": 11770
},
{
"epoch": 3.011247443762781,
"grad_norm": 2.5468101501464844,
"learning_rate": 4.093999262626474e-06,
"loss": 0.0092,
"step": 11780
},
{
"epoch": 3.013803680981595,
"grad_norm": 4.258352279663086,
"learning_rate": 4.0852254671906794e-06,
"loss": 0.0111,
"step": 11790
},
{
"epoch": 3.016359918200409,
"grad_norm": 4.136040210723877,
"learning_rate": 4.076454585093572e-06,
"loss": 0.0247,
"step": 11800
},
{
"epoch": 3.0189161554192228,
"grad_norm": 0.01770654506981373,
"learning_rate": 4.067686644268316e-06,
"loss": 0.0168,
"step": 11810
},
{
"epoch": 3.021472392638037,
"grad_norm": 3.165257453918457,
"learning_rate": 4.0589216726387146e-06,
"loss": 0.0157,
"step": 11820
},
{
"epoch": 3.0240286298568506,
"grad_norm": 1.5152426958084106,
"learning_rate": 4.050159698119107e-06,
"loss": 0.0113,
"step": 11830
},
{
"epoch": 3.0265848670756648,
"grad_norm": 0.025976594537496567,
"learning_rate": 4.0414007486142985e-06,
"loss": 0.0072,
"step": 11840
},
{
"epoch": 3.0291411042944785,
"grad_norm": 4.125540256500244,
"learning_rate": 4.032644852019447e-06,
"loss": 0.0118,
"step": 11850
},
{
"epoch": 3.0316973415132926,
"grad_norm": 0.026777638122439384,
"learning_rate": 4.023892036220001e-06,
"loss": 0.001,
"step": 11860
},
{
"epoch": 3.0342535787321063,
"grad_norm": 3.001214027404785,
"learning_rate": 4.015142329091587e-06,
"loss": 0.0372,
"step": 11870
},
{
"epoch": 3.03680981595092,
"grad_norm": 0.012349724769592285,
"learning_rate": 4.006395758499937e-06,
"loss": 0.0242,
"step": 11880
},
{
"epoch": 3.039366053169734,
"grad_norm": 0.48854807019233704,
"learning_rate": 3.99765235230079e-06,
"loss": 0.0202,
"step": 11890
},
{
"epoch": 3.041922290388548,
"grad_norm": 7.029765605926514,
"learning_rate": 3.988912138339812e-06,
"loss": 0.0228,
"step": 11900
},
{
"epoch": 3.044478527607362,
"grad_norm": 2.26522159576416,
"learning_rate": 3.980175144452496e-06,
"loss": 0.0152,
"step": 11910
},
{
"epoch": 3.0470347648261757,
"grad_norm": 5.204248905181885,
"learning_rate": 3.971441398464088e-06,
"loss": 0.021,
"step": 11920
},
{
"epoch": 3.04959100204499,
"grad_norm": 2.968381881713867,
"learning_rate": 3.962710928189481e-06,
"loss": 0.0234,
"step": 11930
},
{
"epoch": 3.0521472392638036,
"grad_norm": 3.710779905319214,
"learning_rate": 3.953983761433144e-06,
"loss": 0.0067,
"step": 11940
},
{
"epoch": 3.0547034764826178,
"grad_norm": 2.136486530303955,
"learning_rate": 3.94525992598902e-06,
"loss": 0.0096,
"step": 11950
},
{
"epoch": 3.0572597137014315,
"grad_norm": 0.898169219493866,
"learning_rate": 3.936539449640445e-06,
"loss": 0.007,
"step": 11960
},
{
"epoch": 3.0598159509202456,
"grad_norm": 7.237276077270508,
"learning_rate": 3.927822360160053e-06,
"loss": 0.0261,
"step": 11970
},
{
"epoch": 3.0623721881390593,
"grad_norm": 2.5147705078125,
"learning_rate": 3.919108685309699e-06,
"loss": 0.014,
"step": 11980
},
{
"epoch": 3.064928425357873,
"grad_norm": 3.493708372116089,
"learning_rate": 3.9103984528403555e-06,
"loss": 0.0213,
"step": 11990
},
{
"epoch": 3.067484662576687,
"grad_norm": 1.2625579833984375,
"learning_rate": 3.901691690492035e-06,
"loss": 0.0161,
"step": 12000
},
{
"epoch": 3.070040899795501,
"grad_norm": 3.3386011123657227,
"learning_rate": 3.892988425993703e-06,
"loss": 0.004,
"step": 12010
},
{
"epoch": 3.072597137014315,
"grad_norm": 1.1990747451782227,
"learning_rate": 3.884288687063177e-06,
"loss": 0.0109,
"step": 12020
},
{
"epoch": 3.0751533742331287,
"grad_norm": 1.3895822763442993,
"learning_rate": 3.875592501407052e-06,
"loss": 0.0272,
"step": 12030
},
{
"epoch": 3.077709611451943,
"grad_norm": 9.504667282104492,
"learning_rate": 3.866899896720604e-06,
"loss": 0.0211,
"step": 12040
},
{
"epoch": 3.0802658486707566,
"grad_norm": 10.509309768676758,
"learning_rate": 3.858210900687707e-06,
"loss": 0.0174,
"step": 12050
},
{
"epoch": 3.0828220858895707,
"grad_norm": 0.08506203442811966,
"learning_rate": 3.849525540980739e-06,
"loss": 0.0087,
"step": 12060
},
{
"epoch": 3.0853783231083844,
"grad_norm": 1.2189379930496216,
"learning_rate": 3.840843845260501e-06,
"loss": 0.0119,
"step": 12070
},
{
"epoch": 3.087934560327198,
"grad_norm": 0.03395168483257294,
"learning_rate": 3.832165841176121e-06,
"loss": 0.0163,
"step": 12080
},
{
"epoch": 3.0904907975460123,
"grad_norm": 4.858822345733643,
"learning_rate": 3.823491556364973e-06,
"loss": 0.0104,
"step": 12090
},
{
"epoch": 3.093047034764826,
"grad_norm": 0.15337003767490387,
"learning_rate": 3.814821018452583e-06,
"loss": 0.0249,
"step": 12100
},
{
"epoch": 3.09560327198364,
"grad_norm": 6.41199254989624,
"learning_rate": 3.806154255052551e-06,
"loss": 0.0067,
"step": 12110
},
{
"epoch": 3.098159509202454,
"grad_norm": 1.0053160190582275,
"learning_rate": 3.7974912937664455e-06,
"loss": 0.0299,
"step": 12120
},
{
"epoch": 3.100715746421268,
"grad_norm": 1.6271339654922485,
"learning_rate": 3.7888321621837363e-06,
"loss": 0.0053,
"step": 12130
},
{
"epoch": 3.1032719836400817,
"grad_norm": 0.03732278570532799,
"learning_rate": 3.7801768878816892e-06,
"loss": 0.0089,
"step": 12140
},
{
"epoch": 3.105828220858896,
"grad_norm": 4.223018646240234,
"learning_rate": 3.771525498425289e-06,
"loss": 0.0107,
"step": 12150
},
{
"epoch": 3.1083844580777096,
"grad_norm": 1.2061896324157715,
"learning_rate": 3.762878021367148e-06,
"loss": 0.0154,
"step": 12160
},
{
"epoch": 3.1109406952965237,
"grad_norm": 2.464517831802368,
"learning_rate": 3.754234484247418e-06,
"loss": 0.0078,
"step": 12170
},
{
"epoch": 3.1134969325153374,
"grad_norm": 0.042976122349500656,
"learning_rate": 3.745594914593701e-06,
"loss": 0.0114,
"step": 12180
},
{
"epoch": 3.116053169734151,
"grad_norm": 0.11069530993700027,
"learning_rate": 3.7369593399209704e-06,
"loss": 0.0111,
"step": 12190
},
{
"epoch": 3.1186094069529653,
"grad_norm": 0.14891409873962402,
"learning_rate": 3.728327787731465e-06,
"loss": 0.0084,
"step": 12200
},
{
"epoch": 3.121165644171779,
"grad_norm": 0.02942030318081379,
"learning_rate": 3.7197002855146257e-06,
"loss": 0.011,
"step": 12210
},
{
"epoch": 3.123721881390593,
"grad_norm": 3.233976364135742,
"learning_rate": 3.7110768607469842e-06,
"loss": 0.0082,
"step": 12220
},
{
"epoch": 3.126278118609407,
"grad_norm": 3.62264084815979,
"learning_rate": 3.7024575408920958e-06,
"loss": 0.009,
"step": 12230
},
{
"epoch": 3.128834355828221,
"grad_norm": 0.051736973226070404,
"learning_rate": 3.693842353400435e-06,
"loss": 0.0276,
"step": 12240
},
{
"epoch": 3.1313905930470347,
"grad_norm": 1.5636509656906128,
"learning_rate": 3.6852313257093214e-06,
"loss": 0.0283,
"step": 12250
},
{
"epoch": 3.133946830265849,
"grad_norm": 3.639524221420288,
"learning_rate": 3.6766244852428218e-06,
"loss": 0.0209,
"step": 12260
},
{
"epoch": 3.1365030674846626,
"grad_norm": 2.127938985824585,
"learning_rate": 3.6680218594116725e-06,
"loss": 0.0079,
"step": 12270
},
{
"epoch": 3.1390593047034763,
"grad_norm": 5.6783447265625,
"learning_rate": 3.6594234756131826e-06,
"loss": 0.0194,
"step": 12280
},
{
"epoch": 3.1416155419222904,
"grad_norm": 0.3146345615386963,
"learning_rate": 3.6508293612311552e-06,
"loss": 0.0153,
"step": 12290
},
{
"epoch": 3.144171779141104,
"grad_norm": 0.37290289998054504,
"learning_rate": 3.642239543635793e-06,
"loss": 0.0235,
"step": 12300
},
{
"epoch": 3.1467280163599183,
"grad_norm": 0.22575929760932922,
"learning_rate": 3.6336540501836185e-06,
"loss": 0.0109,
"step": 12310
},
{
"epoch": 3.149284253578732,
"grad_norm": 3.687939405441284,
"learning_rate": 3.625072908217378e-06,
"loss": 0.0177,
"step": 12320
},
{
"epoch": 3.151840490797546,
"grad_norm": 0.08439797908067703,
"learning_rate": 3.6164961450659634e-06,
"loss": 0.0045,
"step": 12330
},
{
"epoch": 3.15439672801636,
"grad_norm": 2.362006425857544,
"learning_rate": 3.6079237880443186e-06,
"loss": 0.0142,
"step": 12340
},
{
"epoch": 3.156952965235174,
"grad_norm": 0.579308807849884,
"learning_rate": 3.599355864453357e-06,
"loss": 0.0074,
"step": 12350
},
{
"epoch": 3.1595092024539877,
"grad_norm": 0.3662513494491577,
"learning_rate": 3.5907924015798697e-06,
"loss": 0.0133,
"step": 12360
},
{
"epoch": 3.1620654396728014,
"grad_norm": 0.22020725905895233,
"learning_rate": 3.5822334266964454e-06,
"loss": 0.0245,
"step": 12370
},
{
"epoch": 3.1646216768916156,
"grad_norm": 0.26042699813842773,
"learning_rate": 3.573678967061374e-06,
"loss": 0.0039,
"step": 12380
},
{
"epoch": 3.1671779141104293,
"grad_norm": 4.502334117889404,
"learning_rate": 3.5651290499185752e-06,
"loss": 0.0135,
"step": 12390
},
{
"epoch": 3.1697341513292434,
"grad_norm": 0.07907534390687943,
"learning_rate": 3.556583702497489e-06,
"loss": 0.0058,
"step": 12400
},
{
"epoch": 3.172290388548057,
"grad_norm": 0.012879629619419575,
"learning_rate": 3.5480429520130144e-06,
"loss": 0.018,
"step": 12410
},
{
"epoch": 3.1748466257668713,
"grad_norm": 0.1027621328830719,
"learning_rate": 3.5395068256653984e-06,
"loss": 0.0055,
"step": 12420
},
{
"epoch": 3.177402862985685,
"grad_norm": 2.4270403385162354,
"learning_rate": 3.5309753506401747e-06,
"loss": 0.0186,
"step": 12430
},
{
"epoch": 3.179959100204499,
"grad_norm": 0.0203610397875309,
"learning_rate": 3.5224485541080476e-06,
"loss": 0.011,
"step": 12440
},
{
"epoch": 3.182515337423313,
"grad_norm": 3.286555528640747,
"learning_rate": 3.513926463224836e-06,
"loss": 0.0051,
"step": 12450
},
{
"epoch": 3.185071574642127,
"grad_norm": 0.15632130205631256,
"learning_rate": 3.5054091051313666e-06,
"loss": 0.0061,
"step": 12460
},
{
"epoch": 3.1876278118609407,
"grad_norm": 1.8245761394500732,
"learning_rate": 3.49689650695339e-06,
"loss": 0.0151,
"step": 12470
},
{
"epoch": 3.190184049079755,
"grad_norm": 0.6735230088233948,
"learning_rate": 3.4883886958015046e-06,
"loss": 0.0129,
"step": 12480
},
{
"epoch": 3.1927402862985685,
"grad_norm": 1.4515380859375,
"learning_rate": 3.4798856987710574e-06,
"loss": 0.0222,
"step": 12490
},
{
"epoch": 3.1952965235173822,
"grad_norm": 0.036662183701992035,
"learning_rate": 3.4713875429420656e-06,
"loss": 0.0235,
"step": 12500
},
{
"epoch": 3.1978527607361964,
"grad_norm": 2.479926109313965,
"learning_rate": 3.4628942553791285e-06,
"loss": 0.0075,
"step": 12510
},
{
"epoch": 3.20040899795501,
"grad_norm": 0.033283405005931854,
"learning_rate": 3.4544058631313427e-06,
"loss": 0.0105,
"step": 12520
},
{
"epoch": 3.2029652351738243,
"grad_norm": 0.06612569093704224,
"learning_rate": 3.44592239323221e-06,
"loss": 0.0143,
"step": 12530
},
{
"epoch": 3.205521472392638,
"grad_norm": 0.0648500844836235,
"learning_rate": 3.4374438726995614e-06,
"loss": 0.0086,
"step": 12540
},
{
"epoch": 3.208077709611452,
"grad_norm": 0.08395984768867493,
"learning_rate": 3.4289703285354587e-06,
"loss": 0.0105,
"step": 12550
},
{
"epoch": 3.210633946830266,
"grad_norm": 1.128602147102356,
"learning_rate": 3.4205017877261244e-06,
"loss": 0.0157,
"step": 12560
},
{
"epoch": 3.21319018404908,
"grad_norm": 0.026443956419825554,
"learning_rate": 3.4120382772418346e-06,
"loss": 0.0075,
"step": 12570
},
{
"epoch": 3.2157464212678937,
"grad_norm": 0.2616029679775238,
"learning_rate": 3.4035798240368578e-06,
"loss": 0.0085,
"step": 12580
},
{
"epoch": 3.2183026584867074,
"grad_norm": 3.7674038410186768,
"learning_rate": 3.3951264550493433e-06,
"loss": 0.0166,
"step": 12590
},
{
"epoch": 3.2208588957055215,
"grad_norm": 4.487175941467285,
"learning_rate": 3.3866781972012602e-06,
"loss": 0.0082,
"step": 12600
},
{
"epoch": 3.2234151329243352,
"grad_norm": 4.683178424835205,
"learning_rate": 3.378235077398292e-06,
"loss": 0.0081,
"step": 12610
},
{
"epoch": 3.2259713701431494,
"grad_norm": 0.07378882169723511,
"learning_rate": 3.369797122529762e-06,
"loss": 0.0126,
"step": 12620
},
{
"epoch": 3.228527607361963,
"grad_norm": 0.04591992124915123,
"learning_rate": 3.3613643594685436e-06,
"loss": 0.0069,
"step": 12630
},
{
"epoch": 3.2310838445807772,
"grad_norm": 0.039997998625040054,
"learning_rate": 3.3529368150709762e-06,
"loss": 0.0084,
"step": 12640
},
{
"epoch": 3.233640081799591,
"grad_norm": 0.03221229463815689,
"learning_rate": 3.344514516176778e-06,
"loss": 0.0148,
"step": 12650
},
{
"epoch": 3.236196319018405,
"grad_norm": 2.5336620807647705,
"learning_rate": 3.336097489608962e-06,
"loss": 0.0144,
"step": 12660
},
{
"epoch": 3.238752556237219,
"grad_norm": 0.19575847685337067,
"learning_rate": 3.3276857621737495e-06,
"loss": 0.009,
"step": 12670
},
{
"epoch": 3.2413087934560325,
"grad_norm": 4.261199951171875,
"learning_rate": 3.3192793606604877e-06,
"loss": 0.0123,
"step": 12680
},
{
"epoch": 3.2438650306748467,
"grad_norm": 3.218693733215332,
"learning_rate": 3.3108783118415583e-06,
"loss": 0.0124,
"step": 12690
},
{
"epoch": 3.2464212678936604,
"grad_norm": 0.16256259381771088,
"learning_rate": 3.3024826424722993e-06,
"loss": 0.0139,
"step": 12700
},
{
"epoch": 3.2489775051124745,
"grad_norm": 3.9794180393218994,
"learning_rate": 3.2940923792909134e-06,
"loss": 0.0163,
"step": 12710
},
{
"epoch": 3.2515337423312882,
"grad_norm": 0.19562911987304688,
"learning_rate": 3.28570754901839e-06,
"loss": 0.0087,
"step": 12720
},
{
"epoch": 3.2540899795501024,
"grad_norm": 1.9741108417510986,
"learning_rate": 3.2773281783584104e-06,
"loss": 0.0221,
"step": 12730
},
{
"epoch": 3.256646216768916,
"grad_norm": 5.769931793212891,
"learning_rate": 3.2689542939972742e-06,
"loss": 0.0191,
"step": 12740
},
{
"epoch": 3.2592024539877302,
"grad_norm": 3.071200370788574,
"learning_rate": 3.2605859226038038e-06,
"loss": 0.0333,
"step": 12750
},
{
"epoch": 3.261758691206544,
"grad_norm": 0.05902179330587387,
"learning_rate": 3.2522230908292674e-06,
"loss": 0.0056,
"step": 12760
},
{
"epoch": 3.2643149284253576,
"grad_norm": 4.801800727844238,
"learning_rate": 3.243865825307286e-06,
"loss": 0.03,
"step": 12770
},
{
"epoch": 3.266871165644172,
"grad_norm": 0.9842033386230469,
"learning_rate": 3.2355141526537636e-06,
"loss": 0.0188,
"step": 12780
},
{
"epoch": 3.2694274028629855,
"grad_norm": 0.02561868727207184,
"learning_rate": 3.2271680994667776e-06,
"loss": 0.0064,
"step": 12790
},
{
"epoch": 3.2719836400817996,
"grad_norm": 0.22291143238544464,
"learning_rate": 3.2188276923265237e-06,
"loss": 0.0054,
"step": 12800
},
{
"epoch": 3.2745398773006134,
"grad_norm": 2.7589244842529297,
"learning_rate": 3.2104929577952028e-06,
"loss": 0.0211,
"step": 12810
},
{
"epoch": 3.2770961145194275,
"grad_norm": 0.16009178757667542,
"learning_rate": 3.2021639224169615e-06,
"loss": 0.0069,
"step": 12820
},
{
"epoch": 3.279652351738241,
"grad_norm": 0.02730882354080677,
"learning_rate": 3.1938406127177878e-06,
"loss": 0.0145,
"step": 12830
},
{
"epoch": 3.2822085889570554,
"grad_norm": 0.1984373927116394,
"learning_rate": 3.1855230552054395e-06,
"loss": 0.0114,
"step": 12840
},
{
"epoch": 3.284764826175869,
"grad_norm": 1.7443758249282837,
"learning_rate": 3.177211276369351e-06,
"loss": 0.0084,
"step": 12850
},
{
"epoch": 3.287321063394683,
"grad_norm": 1.0765767097473145,
"learning_rate": 3.1689053026805573e-06,
"loss": 0.0055,
"step": 12860
},
{
"epoch": 3.289877300613497,
"grad_norm": 0.25870829820632935,
"learning_rate": 3.160605160591602e-06,
"loss": 0.0189,
"step": 12870
},
{
"epoch": 3.292433537832311,
"grad_norm": 0.007034212350845337,
"learning_rate": 3.1523108765364598e-06,
"loss": 0.0059,
"step": 12880
},
{
"epoch": 3.294989775051125,
"grad_norm": 3.2303411960601807,
"learning_rate": 3.1440224769304446e-06,
"loss": 0.009,
"step": 12890
},
{
"epoch": 3.2975460122699385,
"grad_norm": 3.338958978652954,
"learning_rate": 3.1357399881701326e-06,
"loss": 0.0126,
"step": 12900
},
{
"epoch": 3.3001022494887526,
"grad_norm": 0.04261789843440056,
"learning_rate": 3.1274634366332775e-06,
"loss": 0.004,
"step": 12910
},
{
"epoch": 3.3026584867075663,
"grad_norm": 0.30257269740104675,
"learning_rate": 3.119192848678717e-06,
"loss": 0.0025,
"step": 12920
},
{
"epoch": 3.3052147239263805,
"grad_norm": 0.821662962436676,
"learning_rate": 3.110928250646307e-06,
"loss": 0.0129,
"step": 12930
},
{
"epoch": 3.307770961145194,
"grad_norm": 0.3390525281429291,
"learning_rate": 3.1026696688568137e-06,
"loss": 0.0106,
"step": 12940
},
{
"epoch": 3.3103271983640083,
"grad_norm": 0.07365961372852325,
"learning_rate": 3.0944171296118574e-06,
"loss": 0.0271,
"step": 12950
},
{
"epoch": 3.312883435582822,
"grad_norm": 0.03872542828321457,
"learning_rate": 3.0861706591938013e-06,
"loss": 0.0106,
"step": 12960
},
{
"epoch": 3.315439672801636,
"grad_norm": 0.08862084150314331,
"learning_rate": 3.0779302838656906e-06,
"loss": 0.0046,
"step": 12970
},
{
"epoch": 3.31799591002045,
"grad_norm": 4.23959493637085,
"learning_rate": 3.0696960298711525e-06,
"loss": 0.0028,
"step": 12980
},
{
"epoch": 3.3205521472392636,
"grad_norm": 0.6766570806503296,
"learning_rate": 3.0614679234343242e-06,
"loss": 0.0076,
"step": 12990
},
{
"epoch": 3.3231083844580778,
"grad_norm": 6.0040130615234375,
"learning_rate": 3.05324599075976e-06,
"loss": 0.0292,
"step": 13000
},
{
"epoch": 3.3256646216768915,
"grad_norm": 1.5274661779403687,
"learning_rate": 3.0450302580323553e-06,
"loss": 0.0104,
"step": 13010
},
{
"epoch": 3.3282208588957056,
"grad_norm": 2.2926924228668213,
"learning_rate": 3.036820751417259e-06,
"loss": 0.038,
"step": 13020
},
{
"epoch": 3.3307770961145193,
"grad_norm": 2.260282278060913,
"learning_rate": 3.0286174970597916e-06,
"loss": 0.0122,
"step": 13030
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.12744282186031342,
"learning_rate": 3.02042052108536e-06,
"loss": 0.0079,
"step": 13040
},
{
"epoch": 3.335889570552147,
"grad_norm": 0.016303053125739098,
"learning_rate": 3.0122298495993803e-06,
"loss": 0.0297,
"step": 13050
},
{
"epoch": 3.3384458077709613,
"grad_norm": 0.024792036041617393,
"learning_rate": 3.0040455086871846e-06,
"loss": 0.015,
"step": 13060
},
{
"epoch": 3.341002044989775,
"grad_norm": 3.460494041442871,
"learning_rate": 2.995867524413949e-06,
"loss": 0.0163,
"step": 13070
},
{
"epoch": 3.3435582822085887,
"grad_norm": 0.18595871329307556,
"learning_rate": 2.9876959228246006e-06,
"loss": 0.0047,
"step": 13080
},
{
"epoch": 3.346114519427403,
"grad_norm": 3.1711583137512207,
"learning_rate": 2.9795307299437425e-06,
"loss": 0.0171,
"step": 13090
},
{
"epoch": 3.3486707566462166,
"grad_norm": 0.08736535161733627,
"learning_rate": 2.971371971775565e-06,
"loss": 0.0196,
"step": 13100
},
{
"epoch": 3.3512269938650308,
"grad_norm": 0.05008082464337349,
"learning_rate": 2.96321967430377e-06,
"loss": 0.0042,
"step": 13110
},
{
"epoch": 3.3537832310838445,
"grad_norm": 2.4688546657562256,
"learning_rate": 2.9550738634914765e-06,
"loss": 0.0086,
"step": 13120
},
{
"epoch": 3.3563394683026586,
"grad_norm": 2.240190267562866,
"learning_rate": 2.946934565281151e-06,
"loss": 0.0203,
"step": 13130
},
{
"epoch": 3.3588957055214723,
"grad_norm": 0.22563564777374268,
"learning_rate": 2.9388018055945157e-06,
"loss": 0.0028,
"step": 13140
},
{
"epoch": 3.3614519427402865,
"grad_norm": 3.0712578296661377,
"learning_rate": 2.930675610332473e-06,
"loss": 0.019,
"step": 13150
},
{
"epoch": 3.3640081799591,
"grad_norm": 2.705103635787964,
"learning_rate": 2.9225560053750113e-06,
"loss": 0.0041,
"step": 13160
},
{
"epoch": 3.366564417177914,
"grad_norm": 0.05551149323582649,
"learning_rate": 2.9144430165811423e-06,
"loss": 0.0132,
"step": 13170
},
{
"epoch": 3.369120654396728,
"grad_norm": 0.1891528069972992,
"learning_rate": 2.9063366697887947e-06,
"loss": 0.0135,
"step": 13180
},
{
"epoch": 3.3716768916155417,
"grad_norm": 0.014974648132920265,
"learning_rate": 2.898236990814751e-06,
"loss": 0.0119,
"step": 13190
},
{
"epoch": 3.374233128834356,
"grad_norm": 3.4752650260925293,
"learning_rate": 2.890144005454557e-06,
"loss": 0.0181,
"step": 13200
},
{
"epoch": 3.3767893660531696,
"grad_norm": 2.042994976043701,
"learning_rate": 2.8820577394824433e-06,
"loss": 0.0029,
"step": 13210
},
{
"epoch": 3.3793456032719837,
"grad_norm": 7.3295769691467285,
"learning_rate": 2.873978218651233e-06,
"loss": 0.0173,
"step": 13220
},
{
"epoch": 3.3819018404907975,
"grad_norm": 0.04749654605984688,
"learning_rate": 2.8659054686922757e-06,
"loss": 0.0123,
"step": 13230
},
{
"epoch": 3.3844580777096116,
"grad_norm": 0.024615732952952385,
"learning_rate": 2.8578395153153536e-06,
"loss": 0.0077,
"step": 13240
},
{
"epoch": 3.3870143149284253,
"grad_norm": 0.014985025860369205,
"learning_rate": 2.849780384208607e-06,
"loss": 0.0039,
"step": 13250
},
{
"epoch": 3.3895705521472395,
"grad_norm": 0.13005146384239197,
"learning_rate": 2.8417281010384396e-06,
"loss": 0.0251,
"step": 13260
},
{
"epoch": 3.392126789366053,
"grad_norm": 0.07327497750520706,
"learning_rate": 2.8336826914494607e-06,
"loss": 0.0027,
"step": 13270
},
{
"epoch": 3.3946830265848673,
"grad_norm": 0.07815208286046982,
"learning_rate": 2.8256441810643755e-06,
"loss": 0.0119,
"step": 13280
},
{
"epoch": 3.397239263803681,
"grad_norm": 1.9264451265335083,
"learning_rate": 2.8176125954839247e-06,
"loss": 0.0107,
"step": 13290
},
{
"epoch": 3.3997955010224947,
"grad_norm": 3.673927068710327,
"learning_rate": 2.8095879602867877e-06,
"loss": 0.0077,
"step": 13300
},
{
"epoch": 3.402351738241309,
"grad_norm": 2.514970064163208,
"learning_rate": 2.8015703010295214e-06,
"loss": 0.0301,
"step": 13310
},
{
"epoch": 3.4049079754601226,
"grad_norm": 2.072049379348755,
"learning_rate": 2.793559643246451e-06,
"loss": 0.0028,
"step": 13320
},
{
"epoch": 3.4074642126789367,
"grad_norm": 2.3494277000427246,
"learning_rate": 2.7855560124496146e-06,
"loss": 0.0079,
"step": 13330
},
{
"epoch": 3.4100204498977504,
"grad_norm": 2.0031983852386475,
"learning_rate": 2.777559434128666e-06,
"loss": 0.0137,
"step": 13340
},
{
"epoch": 3.4125766871165646,
"grad_norm": 4.773671627044678,
"learning_rate": 2.7695699337507996e-06,
"loss": 0.0102,
"step": 13350
},
{
"epoch": 3.4151329243353783,
"grad_norm": 0.5617696642875671,
"learning_rate": 2.7615875367606704e-06,
"loss": 0.0155,
"step": 13360
},
{
"epoch": 3.4176891615541924,
"grad_norm": 5.82913875579834,
"learning_rate": 2.753612268580306e-06,
"loss": 0.0117,
"step": 13370
},
{
"epoch": 3.420245398773006,
"grad_norm": 0.17889423668384552,
"learning_rate": 2.7456441546090335e-06,
"loss": 0.0077,
"step": 13380
},
{
"epoch": 3.42280163599182,
"grad_norm": 3.2761387825012207,
"learning_rate": 2.7376832202233962e-06,
"loss": 0.0039,
"step": 13390
},
{
"epoch": 3.425357873210634,
"grad_norm": 0.0072940983809530735,
"learning_rate": 2.7297294907770735e-06,
"loss": 0.0059,
"step": 13400
},
{
"epoch": 3.4279141104294477,
"grad_norm": 0.05972537025809288,
"learning_rate": 2.7217829916007888e-06,
"loss": 0.0119,
"step": 13410
},
{
"epoch": 3.430470347648262,
"grad_norm": 1.3629683256149292,
"learning_rate": 2.713843748002256e-06,
"loss": 0.0102,
"step": 13420
},
{
"epoch": 3.4330265848670756,
"grad_norm": 2.336515188217163,
"learning_rate": 2.7059117852660667e-06,
"loss": 0.0082,
"step": 13430
},
{
"epoch": 3.4355828220858897,
"grad_norm": 0.766312837600708,
"learning_rate": 2.697987128653633e-06,
"loss": 0.0148,
"step": 13440
},
{
"epoch": 3.4381390593047034,
"grad_norm": 0.01915799267590046,
"learning_rate": 2.6900698034030904e-06,
"loss": 0.0027,
"step": 13450
},
{
"epoch": 3.4406952965235176,
"grad_norm": 6.4156646728515625,
"learning_rate": 2.6821598347292387e-06,
"loss": 0.0227,
"step": 13460
},
{
"epoch": 3.4432515337423313,
"grad_norm": 1.6114623546600342,
"learning_rate": 2.6742572478234363e-06,
"loss": 0.0045,
"step": 13470
},
{
"epoch": 3.445807770961145,
"grad_norm": 0.04842757061123848,
"learning_rate": 2.6663620678535396e-06,
"loss": 0.0031,
"step": 13480
},
{
"epoch": 3.448364008179959,
"grad_norm": 4.460205554962158,
"learning_rate": 2.658474319963812e-06,
"loss": 0.0242,
"step": 13490
},
{
"epoch": 3.450920245398773,
"grad_norm": 1.2775579690933228,
"learning_rate": 2.650594029274853e-06,
"loss": 0.0083,
"step": 13500
},
{
"epoch": 3.453476482617587,
"grad_norm": 8.932818412780762,
"learning_rate": 2.642721220883503e-06,
"loss": 0.0197,
"step": 13510
},
{
"epoch": 3.4560327198364007,
"grad_norm": 2.6447277069091797,
"learning_rate": 2.634855919862782e-06,
"loss": 0.0086,
"step": 13520
},
{
"epoch": 3.458588957055215,
"grad_norm": 4.694246292114258,
"learning_rate": 2.626998151261798e-06,
"loss": 0.0063,
"step": 13530
},
{
"epoch": 3.4611451942740286,
"grad_norm": 5.1632914543151855,
"learning_rate": 2.61914794010567e-06,
"loss": 0.0071,
"step": 13540
},
{
"epoch": 3.4637014314928427,
"grad_norm": 0.45551520586013794,
"learning_rate": 2.6113053113954456e-06,
"loss": 0.0198,
"step": 13550
},
{
"epoch": 3.4662576687116564,
"grad_norm": 0.023942044004797935,
"learning_rate": 2.6034702901080278e-06,
"loss": 0.0098,
"step": 13560
},
{
"epoch": 3.46881390593047,
"grad_norm": 1.2750016450881958,
"learning_rate": 2.5956429011960905e-06,
"loss": 0.0101,
"step": 13570
},
{
"epoch": 3.4713701431492843,
"grad_norm": 4.26313591003418,
"learning_rate": 2.5878231695880023e-06,
"loss": 0.0115,
"step": 13580
},
{
"epoch": 3.473926380368098,
"grad_norm": 0.28257378935813904,
"learning_rate": 2.5800111201877397e-06,
"loss": 0.0079,
"step": 13590
},
{
"epoch": 3.476482617586912,
"grad_norm": 0.5308012962341309,
"learning_rate": 2.572206777874818e-06,
"loss": 0.0096,
"step": 13600
},
{
"epoch": 3.479038854805726,
"grad_norm": 4.8633341789245605,
"learning_rate": 2.5644101675042066e-06,
"loss": 0.021,
"step": 13610
},
{
"epoch": 3.48159509202454,
"grad_norm": 2.2458882331848145,
"learning_rate": 2.5566213139062502e-06,
"loss": 0.0071,
"step": 13620
},
{
"epoch": 3.4841513292433537,
"grad_norm": 0.1293790638446808,
"learning_rate": 2.5488402418865854e-06,
"loss": 0.0114,
"step": 13630
},
{
"epoch": 3.486707566462168,
"grad_norm": 0.014333824627101421,
"learning_rate": 2.5410669762260788e-06,
"loss": 0.0146,
"step": 13640
},
{
"epoch": 3.4892638036809815,
"grad_norm": 4.425572395324707,
"learning_rate": 2.5333015416807192e-06,
"loss": 0.0093,
"step": 13650
},
{
"epoch": 3.4918200408997953,
"grad_norm": 0.04234839603304863,
"learning_rate": 2.525543962981569e-06,
"loss": 0.0049,
"step": 13660
},
{
"epoch": 3.4943762781186094,
"grad_norm": 0.4814109802246094,
"learning_rate": 2.5177942648346597e-06,
"loss": 0.0059,
"step": 13670
},
{
"epoch": 3.4969325153374236,
"grad_norm": 0.25284790992736816,
"learning_rate": 2.5100524719209387e-06,
"loss": 0.0086,
"step": 13680
},
{
"epoch": 3.4994887525562373,
"grad_norm": 2.6780126094818115,
"learning_rate": 2.502318608896165e-06,
"loss": 0.0078,
"step": 13690
},
{
"epoch": 3.502044989775051,
"grad_norm": 1.6357485055923462,
"learning_rate": 2.494592700390848e-06,
"loss": 0.0047,
"step": 13700
},
{
"epoch": 3.504601226993865,
"grad_norm": 0.4582887887954712,
"learning_rate": 2.4868747710101647e-06,
"loss": 0.0093,
"step": 13710
},
{
"epoch": 3.507157464212679,
"grad_norm": 1.8089367151260376,
"learning_rate": 2.479164845333881e-06,
"loss": 0.0039,
"step": 13720
},
{
"epoch": 3.509713701431493,
"grad_norm": 3.7371037006378174,
"learning_rate": 2.471462947916267e-06,
"loss": 0.0095,
"step": 13730
},
{
"epoch": 3.5122699386503067,
"grad_norm": 0.04978760704398155,
"learning_rate": 2.4637691032860306e-06,
"loss": 0.0093,
"step": 13740
},
{
"epoch": 3.5148261758691204,
"grad_norm": 0.17964474856853485,
"learning_rate": 2.456083335946232e-06,
"loss": 0.0245,
"step": 13750
},
{
"epoch": 3.5173824130879345,
"grad_norm": 0.01520370040088892,
"learning_rate": 2.4484056703742083e-06,
"loss": 0.01,
"step": 13760
},
{
"epoch": 3.5199386503067487,
"grad_norm": 0.04782997816801071,
"learning_rate": 2.4407361310214893e-06,
"loss": 0.0102,
"step": 13770
},
{
"epoch": 3.5224948875255624,
"grad_norm": 0.04237792268395424,
"learning_rate": 2.4330747423137314e-06,
"loss": 0.0059,
"step": 13780
},
{
"epoch": 3.525051124744376,
"grad_norm": 0.24677464365959167,
"learning_rate": 2.4254215286506287e-06,
"loss": 0.0035,
"step": 13790
},
{
"epoch": 3.5276073619631902,
"grad_norm": 2.3235230445861816,
"learning_rate": 2.4177765144058424e-06,
"loss": 0.008,
"step": 13800
},
{
"epoch": 3.530163599182004,
"grad_norm": 0.09863277524709702,
"learning_rate": 2.4101397239269202e-06,
"loss": 0.0169,
"step": 13810
},
{
"epoch": 3.532719836400818,
"grad_norm": 0.050358258187770844,
"learning_rate": 2.402511181535213e-06,
"loss": 0.0032,
"step": 13820
},
{
"epoch": 3.535276073619632,
"grad_norm": 0.08366558700799942,
"learning_rate": 2.3948909115258163e-06,
"loss": 0.005,
"step": 13830
},
{
"epoch": 3.537832310838446,
"grad_norm": 0.028095854446291924,
"learning_rate": 2.3872789381674665e-06,
"loss": 0.0131,
"step": 13840
},
{
"epoch": 3.5403885480572597,
"grad_norm": 0.010922097600996494,
"learning_rate": 2.3796752857024854e-06,
"loss": 0.0127,
"step": 13850
},
{
"epoch": 3.542944785276074,
"grad_norm": 5.2768330574035645,
"learning_rate": 2.372079978346691e-06,
"loss": 0.004,
"step": 13860
},
{
"epoch": 3.5455010224948875,
"grad_norm": 5.860825061798096,
"learning_rate": 2.3644930402893297e-06,
"loss": 0.0121,
"step": 13870
},
{
"epoch": 3.5480572597137012,
"grad_norm": 0.030172038823366165,
"learning_rate": 2.356914495692984e-06,
"loss": 0.0014,
"step": 13880
},
{
"epoch": 3.5506134969325154,
"grad_norm": 0.023287015035748482,
"learning_rate": 2.349344368693513e-06,
"loss": 0.0078,
"step": 13890
},
{
"epoch": 3.553169734151329,
"grad_norm": 0.010513374581933022,
"learning_rate": 2.3417826833999657e-06,
"loss": 0.0075,
"step": 13900
},
{
"epoch": 3.5557259713701432,
"grad_norm": 3.824662923812866,
"learning_rate": 2.3342294638945077e-06,
"loss": 0.0234,
"step": 13910
},
{
"epoch": 3.558282208588957,
"grad_norm": 1.5583800077438354,
"learning_rate": 2.3266847342323377e-06,
"loss": 0.0024,
"step": 13920
},
{
"epoch": 3.560838445807771,
"grad_norm": 0.9682608842849731,
"learning_rate": 2.319148518441622e-06,
"loss": 0.0043,
"step": 13930
},
{
"epoch": 3.563394683026585,
"grad_norm": 0.0384635366499424,
"learning_rate": 2.3116208405234107e-06,
"loss": 0.006,
"step": 13940
},
{
"epoch": 3.565950920245399,
"grad_norm": 0.4134227931499481,
"learning_rate": 2.304101724451564e-06,
"loss": 0.0118,
"step": 13950
},
{
"epoch": 3.5685071574642127,
"grad_norm": 0.014091679826378822,
"learning_rate": 2.2965911941726687e-06,
"loss": 0.0034,
"step": 13960
},
{
"epoch": 3.5710633946830264,
"grad_norm": 0.21840809285640717,
"learning_rate": 2.289089273605975e-06,
"loss": 0.0055,
"step": 13970
},
{
"epoch": 3.5736196319018405,
"grad_norm": 0.015261857770383358,
"learning_rate": 2.2815959866433096e-06,
"loss": 0.0019,
"step": 13980
},
{
"epoch": 3.5761758691206547,
"grad_norm": 4.033803939819336,
"learning_rate": 2.2741113571490066e-06,
"loss": 0.0131,
"step": 13990
},
{
"epoch": 3.5787321063394684,
"grad_norm": 0.08580244332551956,
"learning_rate": 2.2666354089598198e-06,
"loss": 0.0133,
"step": 14000
},
{
"epoch": 3.581288343558282,
"grad_norm": 0.17088328301906586,
"learning_rate": 2.2591681658848686e-06,
"loss": 0.0047,
"step": 14010
},
{
"epoch": 3.5838445807770962,
"grad_norm": 2.8940188884735107,
"learning_rate": 2.251709651705535e-06,
"loss": 0.0082,
"step": 14020
},
{
"epoch": 3.58640081799591,
"grad_norm": 1.2774847745895386,
"learning_rate": 2.244259890175412e-06,
"loss": 0.0128,
"step": 14030
},
{
"epoch": 3.588957055214724,
"grad_norm": 0.8745086789131165,
"learning_rate": 2.236818905020207e-06,
"loss": 0.0056,
"step": 14040
},
{
"epoch": 3.591513292433538,
"grad_norm": 0.05803001672029495,
"learning_rate": 2.22938671993769e-06,
"loss": 0.0036,
"step": 14050
},
{
"epoch": 3.5940695296523515,
"grad_norm": 3.186616897583008,
"learning_rate": 2.221963358597593e-06,
"loss": 0.0049,
"step": 14060
},
{
"epoch": 3.5966257668711656,
"grad_norm": 0.13081157207489014,
"learning_rate": 2.214548844641552e-06,
"loss": 0.0159,
"step": 14070
},
{
"epoch": 3.59918200408998,
"grad_norm": 0.5573609471321106,
"learning_rate": 2.2071432016830257e-06,
"loss": 0.0063,
"step": 14080
},
{
"epoch": 3.6017382413087935,
"grad_norm": 0.11412039399147034,
"learning_rate": 2.1997464533072232e-06,
"loss": 0.0092,
"step": 14090
},
{
"epoch": 3.604294478527607,
"grad_norm": 2.3137636184692383,
"learning_rate": 2.1923586230710185e-06,
"loss": 0.0082,
"step": 14100
},
{
"epoch": 3.6068507157464214,
"grad_norm": 0.7297873497009277,
"learning_rate": 2.1849797345028917e-06,
"loss": 0.0057,
"step": 14110
},
{
"epoch": 3.609406952965235,
"grad_norm": 0.14575114846229553,
"learning_rate": 2.1776098111028427e-06,
"loss": 0.0122,
"step": 14120
},
{
"epoch": 3.611963190184049,
"grad_norm": 0.20701062679290771,
"learning_rate": 2.1702488763423206e-06,
"loss": 0.0116,
"step": 14130
},
{
"epoch": 3.614519427402863,
"grad_norm": 2.8510355949401855,
"learning_rate": 2.1628969536641436e-06,
"loss": 0.0094,
"step": 14140
},
{
"epoch": 3.6170756646216766,
"grad_norm": 0.13213932514190674,
"learning_rate": 2.1555540664824337e-06,
"loss": 0.0136,
"step": 14150
},
{
"epoch": 3.6196319018404908,
"grad_norm": 0.011733833700418472,
"learning_rate": 2.1482202381825356e-06,
"loss": 0.0049,
"step": 14160
},
{
"epoch": 3.622188139059305,
"grad_norm": 0.06473023444414139,
"learning_rate": 2.1408954921209435e-06,
"loss": 0.007,
"step": 14170
},
{
"epoch": 3.6247443762781186,
"grad_norm": 0.029512058943510056,
"learning_rate": 2.1335798516252243e-06,
"loss": 0.0187,
"step": 14180
},
{
"epoch": 3.6273006134969323,
"grad_norm": 4.00309944152832,
"learning_rate": 2.126273339993949e-06,
"loss": 0.0142,
"step": 14190
},
{
"epoch": 3.6298568507157465,
"grad_norm": 1.9352320432662964,
"learning_rate": 2.1189759804966142e-06,
"loss": 0.0048,
"step": 14200
},
{
"epoch": 3.63241308793456,
"grad_norm": 2.03886079788208,
"learning_rate": 2.1116877963735714e-06,
"loss": 0.0007,
"step": 14210
},
{
"epoch": 3.6349693251533743,
"grad_norm": 2.063149929046631,
"learning_rate": 2.1044088108359433e-06,
"loss": 0.0113,
"step": 14220
},
{
"epoch": 3.637525562372188,
"grad_norm": 0.1273782104253769,
"learning_rate": 2.0971390470655693e-06,
"loss": 0.008,
"step": 14230
},
{
"epoch": 3.640081799591002,
"grad_norm": 0.050878312438726425,
"learning_rate": 2.089878528214908e-06,
"loss": 0.0002,
"step": 14240
},
{
"epoch": 3.642638036809816,
"grad_norm": 0.3995646834373474,
"learning_rate": 2.082627277406983e-06,
"loss": 0.0134,
"step": 14250
},
{
"epoch": 3.64519427402863,
"grad_norm": 2.8083791732788086,
"learning_rate": 2.0753853177352945e-06,
"loss": 0.0122,
"step": 14260
},
{
"epoch": 3.6477505112474438,
"grad_norm": 0.38471710681915283,
"learning_rate": 2.0681526722637603e-06,
"loss": 0.0061,
"step": 14270
},
{
"epoch": 3.6503067484662575,
"grad_norm": 1.0761078596115112,
"learning_rate": 2.060929364026632e-06,
"loss": 0.0071,
"step": 14280
},
{
"epoch": 3.6528629856850716,
"grad_norm": 4.6696319580078125,
"learning_rate": 2.05371541602842e-06,
"loss": 0.015,
"step": 14290
},
{
"epoch": 3.6554192229038853,
"grad_norm": 1.2931352853775024,
"learning_rate": 2.0465108512438285e-06,
"loss": 0.0105,
"step": 14300
},
{
"epoch": 3.6579754601226995,
"grad_norm": 0.30030888319015503,
"learning_rate": 2.0393156926176796e-06,
"loss": 0.0035,
"step": 14310
},
{
"epoch": 3.660531697341513,
"grad_norm": 1.4162043333053589,
"learning_rate": 2.0321299630648374e-06,
"loss": 0.007,
"step": 14320
},
{
"epoch": 3.6630879345603273,
"grad_norm": 1.6966540813446045,
"learning_rate": 2.0249536854701335e-06,
"loss": 0.0022,
"step": 14330
},
{
"epoch": 3.665644171779141,
"grad_norm": 2.748809337615967,
"learning_rate": 2.017786882688303e-06,
"loss": 0.0059,
"step": 14340
},
{
"epoch": 3.668200408997955,
"grad_norm": 3.920806646347046,
"learning_rate": 2.0106295775439018e-06,
"loss": 0.0024,
"step": 14350
},
{
"epoch": 3.670756646216769,
"grad_norm": 4.018367290496826,
"learning_rate": 2.003481792831242e-06,
"loss": 0.0134,
"step": 14360
},
{
"epoch": 3.6733128834355826,
"grad_norm": 0.7412097454071045,
"learning_rate": 1.9963435513143076e-06,
"loss": 0.0061,
"step": 14370
},
{
"epoch": 3.6758691206543967,
"grad_norm": 0.00914350152015686,
"learning_rate": 1.989214875726702e-06,
"loss": 0.0037,
"step": 14380
},
{
"epoch": 3.678425357873211,
"grad_norm": 0.7989885210990906,
"learning_rate": 1.982095788771552e-06,
"loss": 0.0081,
"step": 14390
},
{
"epoch": 3.6809815950920246,
"grad_norm": 0.09935598075389862,
"learning_rate": 1.9749863131214543e-06,
"loss": 0.0057,
"step": 14400
},
{
"epoch": 3.6835378323108383,
"grad_norm": 0.021534953266382217,
"learning_rate": 1.9678864714183877e-06,
"loss": 0.0009,
"step": 14410
},
{
"epoch": 3.6860940695296525,
"grad_norm": 1.7669703960418701,
"learning_rate": 1.9607962862736617e-06,
"loss": 0.004,
"step": 14420
},
{
"epoch": 3.688650306748466,
"grad_norm": 3.493924856185913,
"learning_rate": 1.9537157802678196e-06,
"loss": 0.0012,
"step": 14430
},
{
"epoch": 3.6912065439672803,
"grad_norm": 0.002254684455692768,
"learning_rate": 1.9466449759505856e-06,
"loss": 0.0053,
"step": 14440
},
{
"epoch": 3.693762781186094,
"grad_norm": 3.5533618927001953,
"learning_rate": 1.939583895840785e-06,
"loss": 0.0053,
"step": 14450
},
{
"epoch": 3.6963190184049077,
"grad_norm": 6.355319976806641,
"learning_rate": 1.932532562426275e-06,
"loss": 0.0086,
"step": 14460
},
{
"epoch": 3.698875255623722,
"grad_norm": 0.021470896899700165,
"learning_rate": 1.925490998163868e-06,
"loss": 0.0097,
"step": 14470
},
{
"epoch": 3.701431492842536,
"grad_norm": 2.308654308319092,
"learning_rate": 1.918459225479268e-06,
"loss": 0.0156,
"step": 14480
},
{
"epoch": 3.7039877300613497,
"grad_norm": 2.9286420345306396,
"learning_rate": 1.911437266766993e-06,
"loss": 0.0076,
"step": 14490
},
{
"epoch": 3.7065439672801634,
"grad_norm": 0.0710514560341835,
"learning_rate": 1.9044251443903088e-06,
"loss": 0.0009,
"step": 14500
},
{
"epoch": 3.7091002044989776,
"grad_norm": 0.029081158339977264,
"learning_rate": 1.8974228806811496e-06,
"loss": 0.0007,
"step": 14510
},
{
"epoch": 3.7116564417177913,
"grad_norm": 4.481345176696777,
"learning_rate": 1.8904304979400557e-06,
"loss": 0.0094,
"step": 14520
},
{
"epoch": 3.7142126789366054,
"grad_norm": 0.005593888461589813,
"learning_rate": 1.8834480184360987e-06,
"loss": 0.0025,
"step": 14530
},
{
"epoch": 3.716768916155419,
"grad_norm": 0.050757136195898056,
"learning_rate": 1.8764754644068122e-06,
"loss": 0.0052,
"step": 14540
},
{
"epoch": 3.719325153374233,
"grad_norm": 0.02077576145529747,
"learning_rate": 1.8695128580581146e-06,
"loss": 0.0015,
"step": 14550
},
{
"epoch": 3.721881390593047,
"grad_norm": 0.041414808481931686,
"learning_rate": 1.862560221564247e-06,
"loss": 0.0077,
"step": 14560
},
{
"epoch": 3.724437627811861,
"grad_norm": 0.014929791912436485,
"learning_rate": 1.8556175770676987e-06,
"loss": 0.0033,
"step": 14570
},
{
"epoch": 3.726993865030675,
"grad_norm": 0.21779873967170715,
"learning_rate": 1.8486849466791385e-06,
"loss": 0.0058,
"step": 14580
},
{
"epoch": 3.7295501022494886,
"grad_norm": 0.025204051285982132,
"learning_rate": 1.8417623524773343e-06,
"loss": 0.0102,
"step": 14590
},
{
"epoch": 3.7321063394683027,
"grad_norm": 0.015351396054029465,
"learning_rate": 1.8348498165091056e-06,
"loss": 0.0017,
"step": 14600
},
{
"epoch": 3.7346625766871164,
"grad_norm": 0.05748201906681061,
"learning_rate": 1.827947360789225e-06,
"loss": 0.0054,
"step": 14610
},
{
"epoch": 3.7372188139059306,
"grad_norm": 1.63164484500885,
"learning_rate": 1.8210550073003701e-06,
"loss": 0.0067,
"step": 14620
},
{
"epoch": 3.7397750511247443,
"grad_norm": 0.021220263093709946,
"learning_rate": 1.814172777993039e-06,
"loss": 0.0112,
"step": 14630
},
{
"epoch": 3.7423312883435584,
"grad_norm": 1.962134599685669,
"learning_rate": 1.807300694785496e-06,
"loss": 0.0066,
"step": 14640
},
{
"epoch": 3.744887525562372,
"grad_norm": 0.02569643221795559,
"learning_rate": 1.800438779563683e-06,
"loss": 0.01,
"step": 14650
},
{
"epoch": 3.7474437627811863,
"grad_norm": 0.10192188620567322,
"learning_rate": 1.7935870541811633e-06,
"loss": 0.0025,
"step": 14660
},
{
"epoch": 3.75,
"grad_norm": 0.06718003004789352,
"learning_rate": 1.7867455404590495e-06,
"loss": 0.0014,
"step": 14670
},
{
"epoch": 3.7525562372188137,
"grad_norm": 0.01870041899383068,
"learning_rate": 1.7799142601859322e-06,
"loss": 0.0062,
"step": 14680
},
{
"epoch": 3.755112474437628,
"grad_norm": 3.080137014389038,
"learning_rate": 1.7730932351178055e-06,
"loss": 0.0049,
"step": 14690
},
{
"epoch": 3.7576687116564416,
"grad_norm": 0.33492809534072876,
"learning_rate": 1.7662824869780094e-06,
"loss": 0.0088,
"step": 14700
},
{
"epoch": 3.7602249488752557,
"grad_norm": 0.7548993825912476,
"learning_rate": 1.759482037457152e-06,
"loss": 0.0021,
"step": 14710
},
{
"epoch": 3.7627811860940694,
"grad_norm": 0.2977140247821808,
"learning_rate": 1.7526919082130434e-06,
"loss": 0.0089,
"step": 14720
},
{
"epoch": 3.7653374233128836,
"grad_norm": 0.009994206950068474,
"learning_rate": 1.7459121208706264e-06,
"loss": 0.0069,
"step": 14730
},
{
"epoch": 3.7678936605316973,
"grad_norm": 1.6630052328109741,
"learning_rate": 1.7391426970219021e-06,
"loss": 0.0103,
"step": 14740
},
{
"epoch": 3.7704498977505114,
"grad_norm": 1.2915098667144775,
"learning_rate": 1.7323836582258774e-06,
"loss": 0.0079,
"step": 14750
},
{
"epoch": 3.773006134969325,
"grad_norm": 3.242319345474243,
"learning_rate": 1.7256350260084736e-06,
"loss": 0.0069,
"step": 14760
},
{
"epoch": 3.775562372188139,
"grad_norm": 0.026173055171966553,
"learning_rate": 1.718896821862478e-06,
"loss": 0.0011,
"step": 14770
},
{
"epoch": 3.778118609406953,
"grad_norm": 0.021731965243816376,
"learning_rate": 1.7121690672474577e-06,
"loss": 0.0042,
"step": 14780
},
{
"epoch": 3.780674846625767,
"grad_norm": 4.898509502410889,
"learning_rate": 1.7054517835897144e-06,
"loss": 0.0178,
"step": 14790
},
{
"epoch": 3.783231083844581,
"grad_norm": 5.831714630126953,
"learning_rate": 1.6987449922821887e-06,
"loss": 0.006,
"step": 14800
},
{
"epoch": 3.7857873210633946,
"grad_norm": 0.009105149656534195,
"learning_rate": 1.6920487146844117e-06,
"loss": 0.0012,
"step": 14810
},
{
"epoch": 3.7883435582822087,
"grad_norm": 0.0681765154004097,
"learning_rate": 1.6853629721224318e-06,
"loss": 0.0064,
"step": 14820
},
{
"epoch": 3.7908997955010224,
"grad_norm": 0.09596231579780579,
"learning_rate": 1.6786877858887457e-06,
"loss": 0.0036,
"step": 14830
},
{
"epoch": 3.7934560327198366,
"grad_norm": 0.2018987387418747,
"learning_rate": 1.6720231772422251e-06,
"loss": 0.0041,
"step": 14840
},
{
"epoch": 3.7960122699386503,
"grad_norm": 0.034721288830041885,
"learning_rate": 1.665369167408062e-06,
"loss": 0.0083,
"step": 14850
},
{
"epoch": 3.798568507157464,
"grad_norm": 0.009844356216490269,
"learning_rate": 1.6587257775776889e-06,
"loss": 0.0047,
"step": 14860
},
{
"epoch": 3.801124744376278,
"grad_norm": 0.014034909196197987,
"learning_rate": 1.6520930289087206e-06,
"loss": 0.0053,
"step": 14870
},
{
"epoch": 3.8036809815950923,
"grad_norm": 0.03924409672617912,
"learning_rate": 1.6454709425248754e-06,
"loss": 0.0053,
"step": 14880
},
{
"epoch": 3.806237218813906,
"grad_norm": 0.03811722993850708,
"learning_rate": 1.6388595395159207e-06,
"loss": 0.0107,
"step": 14890
},
{
"epoch": 3.8087934560327197,
"grad_norm": 3.9966225624084473,
"learning_rate": 1.632258840937599e-06,
"loss": 0.0111,
"step": 14900
},
{
"epoch": 3.811349693251534,
"grad_norm": 0.009593687951564789,
"learning_rate": 1.6256688678115607e-06,
"loss": 0.0138,
"step": 14910
},
{
"epoch": 3.8139059304703475,
"grad_norm": 0.011800892651081085,
"learning_rate": 1.6190896411252966e-06,
"loss": 0.0066,
"step": 14920
},
{
"epoch": 3.8164621676891617,
"grad_norm": 0.02664501592516899,
"learning_rate": 1.612521181832075e-06,
"loss": 0.0053,
"step": 14930
},
{
"epoch": 3.8190184049079754,
"grad_norm": 2.8575503826141357,
"learning_rate": 1.6059635108508731e-06,
"loss": 0.0082,
"step": 14940
},
{
"epoch": 3.821574642126789,
"grad_norm": 1.9057544469833374,
"learning_rate": 1.5994166490663087e-06,
"loss": 0.0026,
"step": 14950
},
{
"epoch": 3.8241308793456033,
"grad_norm": 0.012572694569826126,
"learning_rate": 1.5928806173285716e-06,
"loss": 0.0035,
"step": 14960
},
{
"epoch": 3.8266871165644174,
"grad_norm": 3.401106595993042,
"learning_rate": 1.58635543645337e-06,
"loss": 0.0068,
"step": 14970
},
{
"epoch": 3.829243353783231,
"grad_norm": 0.008161719888448715,
"learning_rate": 1.5798411272218427e-06,
"loss": 0.0048,
"step": 14980
},
{
"epoch": 3.831799591002045,
"grad_norm": 4.691705703735352,
"learning_rate": 1.5733377103805154e-06,
"loss": 0.0045,
"step": 14990
},
{
"epoch": 3.834355828220859,
"grad_norm": 0.011220389977097511,
"learning_rate": 1.5668452066412137e-06,
"loss": 0.0004,
"step": 15000
},
{
"epoch": 3.8369120654396727,
"grad_norm": 0.5998366475105286,
"learning_rate": 1.56036363668102e-06,
"loss": 0.0055,
"step": 15010
},
{
"epoch": 3.839468302658487,
"grad_norm": 3.6791257858276367,
"learning_rate": 1.5538930211421839e-06,
"loss": 0.0094,
"step": 15020
},
{
"epoch": 3.8420245398773005,
"grad_norm": 0.025082003325223923,
"learning_rate": 1.5474333806320735e-06,
"loss": 0.004,
"step": 15030
},
{
"epoch": 3.8445807770961147,
"grad_norm": 0.01650637947022915,
"learning_rate": 1.540984735723104e-06,
"loss": 0.0042,
"step": 15040
},
{
"epoch": 3.8471370143149284,
"grad_norm": 0.943402111530304,
"learning_rate": 1.5345471069526718e-06,
"loss": 0.0047,
"step": 15050
},
{
"epoch": 3.8496932515337425,
"grad_norm": 0.018428007140755653,
"learning_rate": 1.5281205148230866e-06,
"loss": 0.0187,
"step": 15060
},
{
"epoch": 3.8522494887525562,
"grad_norm": 0.013692053034901619,
"learning_rate": 1.5217049798015127e-06,
"loss": 0.0018,
"step": 15070
},
{
"epoch": 3.85480572597137,
"grad_norm": 0.01854683831334114,
"learning_rate": 1.5153005223198986e-06,
"loss": 0.0011,
"step": 15080
},
{
"epoch": 3.857361963190184,
"grad_norm": 0.07175463438034058,
"learning_rate": 1.5089071627749157e-06,
"loss": 0.0003,
"step": 15090
},
{
"epoch": 3.859918200408998,
"grad_norm": 0.3006972074508667,
"learning_rate": 1.5025249215278852e-06,
"loss": 0.0027,
"step": 15100
},
{
"epoch": 3.862474437627812,
"grad_norm": 0.5995022058486938,
"learning_rate": 1.4961538189047258e-06,
"loss": 0.0079,
"step": 15110
},
{
"epoch": 3.8650306748466257,
"grad_norm": 0.03315654397010803,
"learning_rate": 1.489793875195879e-06,
"loss": 0.0002,
"step": 15120
},
{
"epoch": 3.86758691206544,
"grad_norm": 0.01580039970576763,
"learning_rate": 1.4834451106562502e-06,
"loss": 0.0002,
"step": 15130
},
{
"epoch": 3.8701431492842535,
"grad_norm": 0.047737788408994675,
"learning_rate": 1.477107545505137e-06,
"loss": 0.0041,
"step": 15140
},
{
"epoch": 3.8726993865030677,
"grad_norm": 2.45046329498291,
"learning_rate": 1.470781199926174e-06,
"loss": 0.0075,
"step": 15150
},
{
"epoch": 3.8752556237218814,
"grad_norm": 3.830009698867798,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.0058,
"step": 15160
},
{
"epoch": 3.877811860940695,
"grad_norm": 0.03586220741271973,
"learning_rate": 1.4581622480405095e-06,
"loss": 0.0055,
"step": 15170
},
{
"epoch": 3.8803680981595092,
"grad_norm": 0.048213325440883636,
"learning_rate": 1.45186968192216e-06,
"loss": 0.0135,
"step": 15180
},
{
"epoch": 3.8829243353783234,
"grad_norm": 0.011242564767599106,
"learning_rate": 1.4455884157525369e-06,
"loss": 0.0049,
"step": 15190
},
{
"epoch": 3.885480572597137,
"grad_norm": 0.26863622665405273,
"learning_rate": 1.4393184695359752e-06,
"loss": 0.0038,
"step": 15200
},
{
"epoch": 3.888036809815951,
"grad_norm": 0.09017948806285858,
"learning_rate": 1.4330598632407554e-06,
"loss": 0.0018,
"step": 15210
},
{
"epoch": 3.890593047034765,
"grad_norm": 0.21921706199645996,
"learning_rate": 1.4268126167990475e-06,
"loss": 0.0051,
"step": 15220
},
{
"epoch": 3.8931492842535786,
"grad_norm": 0.048430170863866806,
"learning_rate": 1.4205767501068413e-06,
"loss": 0.0027,
"step": 15230
},
{
"epoch": 3.895705521472393,
"grad_norm": 0.03785645216703415,
"learning_rate": 1.4143522830238855e-06,
"loss": 0.0022,
"step": 15240
},
{
"epoch": 3.8982617586912065,
"grad_norm": 0.018065497279167175,
"learning_rate": 1.4081392353736206e-06,
"loss": 0.0075,
"step": 15250
},
{
"epoch": 3.90081799591002,
"grad_norm": 1.4500396251678467,
"learning_rate": 1.4019376269431229e-06,
"loss": 0.0034,
"step": 15260
},
{
"epoch": 3.9033742331288344,
"grad_norm": 0.04054681211709976,
"learning_rate": 1.395747477483036e-06,
"loss": 0.0021,
"step": 15270
},
{
"epoch": 3.9059304703476485,
"grad_norm": 1.107225775718689,
"learning_rate": 1.3895688067075109e-06,
"loss": 0.0012,
"step": 15280
},
{
"epoch": 3.908486707566462,
"grad_norm": 0.14938171207904816,
"learning_rate": 1.3834016342941364e-06,
"loss": 0.0052,
"step": 15290
},
{
"epoch": 3.911042944785276,
"grad_norm": 0.2700784504413605,
"learning_rate": 1.3772459798838884e-06,
"loss": 0.022,
"step": 15300
},
{
"epoch": 3.91359918200409,
"grad_norm": 0.010788323357701302,
"learning_rate": 1.3711018630810568e-06,
"loss": 0.0127,
"step": 15310
},
{
"epoch": 3.9161554192229038,
"grad_norm": 0.17254537343978882,
"learning_rate": 1.3649693034531908e-06,
"loss": 0.0026,
"step": 15320
},
{
"epoch": 3.918711656441718,
"grad_norm": 2.0272927284240723,
"learning_rate": 1.3588483205310238e-06,
"loss": 0.0028,
"step": 15330
},
{
"epoch": 3.9212678936605316,
"grad_norm": 0.7689258456230164,
"learning_rate": 1.352738933808434e-06,
"loss": 0.0046,
"step": 15340
},
{
"epoch": 3.9238241308793453,
"grad_norm": 0.09393978118896484,
"learning_rate": 1.3466411627423553e-06,
"loss": 0.0058,
"step": 15350
},
{
"epoch": 3.9263803680981595,
"grad_norm": 0.02193518355488777,
"learning_rate": 1.3405550267527373e-06,
"loss": 0.0118,
"step": 15360
},
{
"epoch": 3.9289366053169736,
"grad_norm": 1.4280931949615479,
"learning_rate": 1.3344805452224668e-06,
"loss": 0.0055,
"step": 15370
},
{
"epoch": 3.9314928425357873,
"grad_norm": 0.017598293721675873,
"learning_rate": 1.3284177374973252e-06,
"loss": 0.0001,
"step": 15380
},
{
"epoch": 3.934049079754601,
"grad_norm": 0.017776915803551674,
"learning_rate": 1.3223666228859034e-06,
"loss": 0.0089,
"step": 15390
},
{
"epoch": 3.936605316973415,
"grad_norm": 0.025338156148791313,
"learning_rate": 1.3163272206595607e-06,
"loss": 0.0101,
"step": 15400
},
{
"epoch": 3.939161554192229,
"grad_norm": 0.00857964251190424,
"learning_rate": 1.3102995500523513e-06,
"loss": 0.0002,
"step": 15410
},
{
"epoch": 3.941717791411043,
"grad_norm": 0.17898601293563843,
"learning_rate": 1.3042836302609707e-06,
"loss": 0.0083,
"step": 15420
},
{
"epoch": 3.9442740286298568,
"grad_norm": 0.01416697259992361,
"learning_rate": 1.2982794804446858e-06,
"loss": 0.0031,
"step": 15430
},
{
"epoch": 3.946830265848671,
"grad_norm": 0.03067069500684738,
"learning_rate": 1.2922871197252818e-06,
"loss": 0.0027,
"step": 15440
},
{
"epoch": 3.9493865030674846,
"grad_norm": 0.013419978320598602,
"learning_rate": 1.2863065671869995e-06,
"loss": 0.0004,
"step": 15450
},
{
"epoch": 3.9519427402862988,
"grad_norm": 0.36794596910476685,
"learning_rate": 1.2803378418764728e-06,
"loss": 0.0034,
"step": 15460
},
{
"epoch": 3.9544989775051125,
"grad_norm": 0.014534058049321175,
"learning_rate": 1.274380962802666e-06,
"loss": 0.0006,
"step": 15470
},
{
"epoch": 3.957055214723926,
"grad_norm": 0.953043520450592,
"learning_rate": 1.2684359489368186e-06,
"loss": 0.0097,
"step": 15480
},
{
"epoch": 3.9596114519427403,
"grad_norm": 0.0640961155295372,
"learning_rate": 1.2625028192123822e-06,
"loss": 0.0076,
"step": 15490
},
{
"epoch": 3.962167689161554,
"grad_norm": 0.026453586295247078,
"learning_rate": 1.2565815925249613e-06,
"loss": 0.0042,
"step": 15500
},
{
"epoch": 3.964723926380368,
"grad_norm": 0.02020988054573536,
"learning_rate": 1.250672287732247e-06,
"loss": 0.0005,
"step": 15510
},
{
"epoch": 3.967280163599182,
"grad_norm": 0.8880366683006287,
"learning_rate": 1.2447749236539674e-06,
"loss": 0.0122,
"step": 15520
},
{
"epoch": 3.969836400817996,
"grad_norm": 0.06537387520074844,
"learning_rate": 1.2388895190718209e-06,
"loss": 0.0043,
"step": 15530
},
{
"epoch": 3.9723926380368098,
"grad_norm": 0.03674660250544548,
"learning_rate": 1.2330160927294178e-06,
"loss": 0.001,
"step": 15540
},
{
"epoch": 3.974948875255624,
"grad_norm": 0.06352321058511734,
"learning_rate": 1.2271546633322157e-06,
"loss": 0.0007,
"step": 15550
},
{
"epoch": 3.9775051124744376,
"grad_norm": 2.053643226623535,
"learning_rate": 1.2213052495474759e-06,
"loss": 0.0064,
"step": 15560
},
{
"epoch": 3.9800613496932513,
"grad_norm": 0.006071037612855434,
"learning_rate": 1.2154678700041805e-06,
"loss": 0.0061,
"step": 15570
},
{
"epoch": 3.9826175869120655,
"grad_norm": 0.032529015094041824,
"learning_rate": 1.2096425432929943e-06,
"loss": 0.0042,
"step": 15580
},
{
"epoch": 3.9851738241308796,
"grad_norm": 0.14356601238250732,
"learning_rate": 1.2038292879661896e-06,
"loss": 0.0025,
"step": 15590
},
{
"epoch": 3.9877300613496933,
"grad_norm": 0.009755146689713001,
"learning_rate": 1.1980281225376029e-06,
"loss": 0.0123,
"step": 15600
},
{
"epoch": 3.990286298568507,
"grad_norm": 0.026369577273726463,
"learning_rate": 1.1922390654825582e-06,
"loss": 0.002,
"step": 15610
},
{
"epoch": 3.992842535787321,
"grad_norm": 1.3809363842010498,
"learning_rate": 1.186462135237823e-06,
"loss": 0.0037,
"step": 15620
},
{
"epoch": 3.995398773006135,
"grad_norm": 0.06871844828128815,
"learning_rate": 1.1806973502015423e-06,
"loss": 0.0076,
"step": 15630
},
{
"epoch": 3.997955010224949,
"grad_norm": 0.01024967897683382,
"learning_rate": 1.1749447287331805e-06,
"loss": 0.0064,
"step": 15640
},
{
"epoch": 4.000511247443763,
"grad_norm": 0.010401812382042408,
"learning_rate": 1.1692042891534677e-06,
"loss": 0.0001,
"step": 15650
},
{
"epoch": 4.0030674846625764,
"grad_norm": 0.019328856840729713,
"learning_rate": 1.1634760497443308e-06,
"loss": 0.0011,
"step": 15660
},
{
"epoch": 4.00562372188139,
"grad_norm": 2.2071306705474854,
"learning_rate": 1.1577600287488472e-06,
"loss": 0.0046,
"step": 15670
},
{
"epoch": 4.008179959100205,
"grad_norm": 0.2718164026737213,
"learning_rate": 1.1520562443711813e-06,
"loss": 0.0002,
"step": 15680
},
{
"epoch": 4.0107361963190185,
"grad_norm": 0.0036849735770374537,
"learning_rate": 1.1463647147765262e-06,
"loss": 0.0024,
"step": 15690
},
{
"epoch": 4.013292433537832,
"grad_norm": 0.026393355801701546,
"learning_rate": 1.1406854580910426e-06,
"loss": 0.0003,
"step": 15700
},
{
"epoch": 4.015848670756646,
"grad_norm": 0.02888057939708233,
"learning_rate": 1.1350184924018137e-06,
"loss": 0.001,
"step": 15710
},
{
"epoch": 4.0184049079754605,
"grad_norm": 4.927444934844971,
"learning_rate": 1.1293638357567692e-06,
"loss": 0.0046,
"step": 15720
},
{
"epoch": 4.020961145194274,
"grad_norm": 1.5200058221817017,
"learning_rate": 1.1237215061646446e-06,
"loss": 0.0009,
"step": 15730
},
{
"epoch": 4.023517382413088,
"grad_norm": 0.006760958582162857,
"learning_rate": 1.118091521594909e-06,
"loss": 0.0001,
"step": 15740
},
{
"epoch": 4.026073619631902,
"grad_norm": 0.0747433677315712,
"learning_rate": 1.1124738999777268e-06,
"loss": 0.0004,
"step": 15750
},
{
"epoch": 4.028629856850716,
"grad_norm": 0.018624255433678627,
"learning_rate": 1.1068686592038786e-06,
"loss": 0.0011,
"step": 15760
},
{
"epoch": 4.03118609406953,
"grad_norm": 0.018480489030480385,
"learning_rate": 1.10127581712472e-06,
"loss": 0.0029,
"step": 15770
},
{
"epoch": 4.033742331288344,
"grad_norm": 0.08727142959833145,
"learning_rate": 1.0956953915521196e-06,
"loss": 0.0015,
"step": 15780
},
{
"epoch": 4.036298568507157,
"grad_norm": 0.17428268492221832,
"learning_rate": 1.0901274002584029e-06,
"loss": 0.0003,
"step": 15790
},
{
"epoch": 4.038854805725971,
"grad_norm": 0.006692373659461737,
"learning_rate": 1.0845718609762912e-06,
"loss": 0.0016,
"step": 15800
},
{
"epoch": 4.041411042944786,
"grad_norm": 0.03485719487071037,
"learning_rate": 1.0790287913988533e-06,
"loss": 0.0028,
"step": 15810
},
{
"epoch": 4.043967280163599,
"grad_norm": 0.1434144675731659,
"learning_rate": 1.0734982091794439e-06,
"loss": 0.0014,
"step": 15820
},
{
"epoch": 4.046523517382413,
"grad_norm": 0.025571011006832123,
"learning_rate": 1.067980131931649e-06,
"loss": 0.0043,
"step": 15830
},
{
"epoch": 4.049079754601227,
"grad_norm": 0.022263115271925926,
"learning_rate": 1.0624745772292262e-06,
"loss": 0.0001,
"step": 15840
},
{
"epoch": 4.051635991820041,
"grad_norm": 0.026498448103666306,
"learning_rate": 1.0569815626060553e-06,
"loss": 0.0014,
"step": 15850
},
{
"epoch": 4.054192229038855,
"grad_norm": 0.11276555806398392,
"learning_rate": 1.051501105556077e-06,
"loss": 0.0012,
"step": 15860
},
{
"epoch": 4.056748466257669,
"grad_norm": 0.03225693851709366,
"learning_rate": 1.0460332235332421e-06,
"loss": 0.0019,
"step": 15870
},
{
"epoch": 4.059304703476482,
"grad_norm": 0.6012237071990967,
"learning_rate": 1.0405779339514466e-06,
"loss": 0.0028,
"step": 15880
},
{
"epoch": 4.061860940695296,
"grad_norm": 1.399053931236267,
"learning_rate": 1.0351352541844895e-06,
"loss": 0.0005,
"step": 15890
},
{
"epoch": 4.064417177914111,
"grad_norm": 0.004602417815476656,
"learning_rate": 1.0297052015660065e-06,
"loss": 0.0018,
"step": 15900
},
{
"epoch": 4.066973415132924,
"grad_norm": 0.011124187149107456,
"learning_rate": 1.0242877933894212e-06,
"loss": 0.0026,
"step": 15910
},
{
"epoch": 4.069529652351738,
"grad_norm": 0.012430194765329361,
"learning_rate": 1.0188830469078832e-06,
"loss": 0.0008,
"step": 15920
},
{
"epoch": 4.072085889570552,
"grad_norm": 0.00355120119638741,
"learning_rate": 1.0134909793342251e-06,
"loss": 0.0014,
"step": 15930
},
{
"epoch": 4.074642126789366,
"grad_norm": 0.010579611174762249,
"learning_rate": 1.0081116078408932e-06,
"loss": 0.0002,
"step": 15940
},
{
"epoch": 4.07719836400818,
"grad_norm": 0.13667239248752594,
"learning_rate": 1.0027449495599045e-06,
"loss": 0.0002,
"step": 15950
},
{
"epoch": 4.079754601226994,
"grad_norm": 0.013385191559791565,
"learning_rate": 9.97391021582782e-07,
"loss": 0.0018,
"step": 15960
},
{
"epoch": 4.0823108384458076,
"grad_norm": 0.09518828243017197,
"learning_rate": 9.92049840960514e-07,
"loss": 0.002,
"step": 15970
},
{
"epoch": 4.084867075664621,
"grad_norm": 0.017801359295845032,
"learning_rate": 9.86721424703483e-07,
"loss": 0.0005,
"step": 15980
},
{
"epoch": 4.087423312883436,
"grad_norm": 0.021596604958176613,
"learning_rate": 9.81405789781425e-07,
"loss": 0.0011,
"step": 15990
},
{
"epoch": 4.08997955010225,
"grad_norm": 2.4400172233581543,
"learning_rate": 9.76102953123369e-07,
"loss": 0.0041,
"step": 16000
},
{
"epoch": 4.092535787321063,
"grad_norm": 0.07604683190584183,
"learning_rate": 9.708129316175875e-07,
"loss": 0.0009,
"step": 16010
},
{
"epoch": 4.095092024539877,
"grad_norm": 0.00839927326887846,
"learning_rate": 9.655357421115324e-07,
"loss": 0.0001,
"step": 16020
},
{
"epoch": 4.097648261758692,
"grad_norm": 0.8300763964653015,
"learning_rate": 9.60271401411797e-07,
"loss": 0.0009,
"step": 16030
},
{
"epoch": 4.100204498977505,
"grad_norm": 0.036536745727062225,
"learning_rate": 9.550199262840494e-07,
"loss": 0.0004,
"step": 16040
},
{
"epoch": 4.102760736196319,
"grad_norm": 0.013892588205635548,
"learning_rate": 9.49781333452987e-07,
"loss": 0.0009,
"step": 16050
},
{
"epoch": 4.105316973415133,
"grad_norm": 0.27383795380592346,
"learning_rate": 9.445556396022754e-07,
"loss": 0.0005,
"step": 16060
},
{
"epoch": 4.107873210633947,
"grad_norm": 0.009578673169016838,
"learning_rate": 9.393428613745036e-07,
"loss": 0.0036,
"step": 16070
},
{
"epoch": 4.110429447852761,
"grad_norm": 0.42609789967536926,
"learning_rate": 9.341430153711306e-07,
"loss": 0.0049,
"step": 16080
},
{
"epoch": 4.112985685071575,
"grad_norm": 0.12703081965446472,
"learning_rate": 9.289561181524214e-07,
"loss": 0.0037,
"step": 16090
},
{
"epoch": 4.115541922290388,
"grad_norm": 0.13383671641349792,
"learning_rate": 9.237821862374092e-07,
"loss": 0.0022,
"step": 16100
},
{
"epoch": 4.118098159509202,
"grad_norm": 0.011773956939578056,
"learning_rate": 9.186212361038288e-07,
"loss": 0.0002,
"step": 16110
},
{
"epoch": 4.120654396728017,
"grad_norm": 2.616377115249634,
"learning_rate": 9.134732841880811e-07,
"loss": 0.003,
"step": 16120
},
{
"epoch": 4.12321063394683,
"grad_norm": 0.012035293504595757,
"learning_rate": 9.083383468851609e-07,
"loss": 0.0079,
"step": 16130
},
{
"epoch": 4.125766871165644,
"grad_norm": 0.2209741622209549,
"learning_rate": 9.032164405486193e-07,
"loss": 0.0047,
"step": 16140
},
{
"epoch": 4.128323108384458,
"grad_norm": 0.00564198475331068,
"learning_rate": 8.981075814905077e-07,
"loss": 0.0009,
"step": 16150
},
{
"epoch": 4.130879345603272,
"grad_norm": 0.021742451936006546,
"learning_rate": 8.930117859813236e-07,
"loss": 0.0009,
"step": 16160
},
{
"epoch": 4.133435582822086,
"grad_norm": 0.011637063696980476,
"learning_rate": 8.879290702499576e-07,
"loss": 0.0025,
"step": 16170
},
{
"epoch": 4.1359918200409,
"grad_norm": 0.44723692536354065,
"learning_rate": 8.828594504836491e-07,
"loss": 0.0012,
"step": 16180
},
{
"epoch": 4.1385480572597135,
"grad_norm": 2.4436564445495605,
"learning_rate": 8.778029428279278e-07,
"loss": 0.0014,
"step": 16190
},
{
"epoch": 4.141104294478527,
"grad_norm": 0.3361468017101288,
"learning_rate": 8.727595633865643e-07,
"loss": 0.0013,
"step": 16200
},
{
"epoch": 4.143660531697342,
"grad_norm": 1.743153691291809,
"learning_rate": 8.677293282215182e-07,
"loss": 0.0022,
"step": 16210
},
{
"epoch": 4.1462167689161555,
"grad_norm": 0.13101418316364288,
"learning_rate": 8.627122533528892e-07,
"loss": 0.0001,
"step": 16220
},
{
"epoch": 4.148773006134969,
"grad_norm": 0.00849368516355753,
"learning_rate": 8.577083547588638e-07,
"loss": 0.0001,
"step": 16230
},
{
"epoch": 4.151329243353783,
"grad_norm": 0.005825154948979616,
"learning_rate": 8.527176483756671e-07,
"loss": 0.0004,
"step": 16240
},
{
"epoch": 4.1538854805725975,
"grad_norm": 0.012522836215794086,
"learning_rate": 8.477401500975063e-07,
"loss": 0.0005,
"step": 16250
},
{
"epoch": 4.156441717791411,
"grad_norm": 0.1203024610877037,
"learning_rate": 8.427758757765264e-07,
"loss": 0.0029,
"step": 16260
},
{
"epoch": 4.158997955010225,
"grad_norm": 0.046782489866018295,
"learning_rate": 8.378248412227574e-07,
"loss": 0.0016,
"step": 16270
},
{
"epoch": 4.161554192229039,
"grad_norm": 0.02540050819516182,
"learning_rate": 8.328870622040652e-07,
"loss": 0.0001,
"step": 16280
},
{
"epoch": 4.164110429447852,
"grad_norm": 0.00631357915699482,
"learning_rate": 8.27962554446094e-07,
"loss": 0.0001,
"step": 16290
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.02206520363688469,
"learning_rate": 8.23051333632231e-07,
"loss": 0.0001,
"step": 16300
},
{
"epoch": 4.169222903885481,
"grad_norm": 0.02339177392423153,
"learning_rate": 8.181534154035398e-07,
"loss": 0.0012,
"step": 16310
},
{
"epoch": 4.171779141104294,
"grad_norm": 0.11059535294771194,
"learning_rate": 8.132688153587237e-07,
"loss": 0.0002,
"step": 16320
},
{
"epoch": 4.174335378323108,
"grad_norm": 0.04154384881258011,
"learning_rate": 8.083975490540658e-07,
"loss": 0.0003,
"step": 16330
},
{
"epoch": 4.176891615541923,
"grad_norm": 0.004592495039105415,
"learning_rate": 8.035396320033911e-07,
"loss": 0.0022,
"step": 16340
},
{
"epoch": 4.179447852760736,
"grad_norm": 0.005622932221740484,
"learning_rate": 7.98695079678004e-07,
"loss": 0.0001,
"step": 16350
},
{
"epoch": 4.18200408997955,
"grad_norm": 0.008403644897043705,
"learning_rate": 7.93863907506649e-07,
"loss": 0.0007,
"step": 16360
},
{
"epoch": 4.184560327198364,
"grad_norm": 0.04854018986225128,
"learning_rate": 7.890461308754565e-07,
"loss": 0.0025,
"step": 16370
},
{
"epoch": 4.1871165644171775,
"grad_norm": 0.007120284251868725,
"learning_rate": 7.842417651278978e-07,
"loss": 0.0041,
"step": 16380
},
{
"epoch": 4.189672801635992,
"grad_norm": 0.005977618508040905,
"learning_rate": 7.794508255647293e-07,
"loss": 0.0005,
"step": 16390
},
{
"epoch": 4.192229038854806,
"grad_norm": 0.01604490913450718,
"learning_rate": 7.746733274439517e-07,
"loss": 0.0005,
"step": 16400
},
{
"epoch": 4.1947852760736195,
"grad_norm": 0.04814684018492699,
"learning_rate": 7.699092859807566e-07,
"loss": 0.0006,
"step": 16410
},
{
"epoch": 4.197341513292433,
"grad_norm": 0.9865232706069946,
"learning_rate": 7.651587163474822e-07,
"loss": 0.0002,
"step": 16420
},
{
"epoch": 4.199897750511248,
"grad_norm": 0.014311658218502998,
"learning_rate": 7.604216336735554e-07,
"loss": 0.0016,
"step": 16430
},
{
"epoch": 4.2024539877300615,
"grad_norm": 0.07794589549303055,
"learning_rate": 7.556980530454571e-07,
"loss": 0.001,
"step": 16440
},
{
"epoch": 4.205010224948875,
"grad_norm": 0.1298278272151947,
"learning_rate": 7.509879895066652e-07,
"loss": 0.0025,
"step": 16450
},
{
"epoch": 4.207566462167689,
"grad_norm": 0.026249248534440994,
"learning_rate": 7.462914580576081e-07,
"loss": 0.0028,
"step": 16460
},
{
"epoch": 4.210122699386503,
"grad_norm": 0.008666482754051685,
"learning_rate": 7.416084736556173e-07,
"loss": 0.0007,
"step": 16470
},
{
"epoch": 4.212678936605317,
"grad_norm": 0.37366658449172974,
"learning_rate": 7.369390512148816e-07,
"loss": 0.0008,
"step": 16480
},
{
"epoch": 4.215235173824131,
"grad_norm": 0.007286503445357084,
"learning_rate": 7.322832056063978e-07,
"loss": 0.0003,
"step": 16490
},
{
"epoch": 4.217791411042945,
"grad_norm": 1.1950106620788574,
"learning_rate": 7.276409516579252e-07,
"loss": 0.0024,
"step": 16500
},
{
"epoch": 4.220347648261758,
"grad_norm": 0.008293128572404385,
"learning_rate": 7.23012304153931e-07,
"loss": 0.0001,
"step": 16510
},
{
"epoch": 4.222903885480573,
"grad_norm": 0.009917296469211578,
"learning_rate": 7.183972778355586e-07,
"loss": 0.0006,
"step": 16520
},
{
"epoch": 4.225460122699387,
"grad_norm": 0.013085747137665749,
"learning_rate": 7.137958874005629e-07,
"loss": 0.003,
"step": 16530
},
{
"epoch": 4.2280163599182,
"grad_norm": 0.09627839922904968,
"learning_rate": 7.092081475032753e-07,
"loss": 0.0006,
"step": 16540
},
{
"epoch": 4.230572597137014,
"grad_norm": 0.007571856956928968,
"learning_rate": 7.046340727545531e-07,
"loss": 0.0001,
"step": 16550
},
{
"epoch": 4.233128834355828,
"grad_norm": 0.005919609218835831,
"learning_rate": 7.000736777217332e-07,
"loss": 0.0015,
"step": 16560
},
{
"epoch": 4.235685071574642,
"grad_norm": 0.016153009608387947,
"learning_rate": 6.955269769285877e-07,
"loss": 0.0001,
"step": 16570
},
{
"epoch": 4.238241308793456,
"grad_norm": 0.016689471900463104,
"learning_rate": 6.909939848552722e-07,
"loss": 0.0015,
"step": 16580
},
{
"epoch": 4.24079754601227,
"grad_norm": 0.012151209637522697,
"learning_rate": 6.864747159382851e-07,
"loss": 0.0056,
"step": 16590
},
{
"epoch": 4.2433537832310835,
"grad_norm": 0.006638688966631889,
"learning_rate": 6.819691845704207e-07,
"loss": 0.002,
"step": 16600
},
{
"epoch": 4.245910020449898,
"grad_norm": 0.026410236954689026,
"learning_rate": 6.774774051007227e-07,
"loss": 0.0006,
"step": 16610
},
{
"epoch": 4.248466257668712,
"grad_norm": 0.03302263841032982,
"learning_rate": 6.729993918344347e-07,
"loss": 0.0001,
"step": 16620
},
{
"epoch": 4.2510224948875255,
"grad_norm": 0.010984467342495918,
"learning_rate": 6.685351590329625e-07,
"loss": 0.0033,
"step": 16630
},
{
"epoch": 4.253578732106339,
"grad_norm": 0.01048219483345747,
"learning_rate": 6.640847209138224e-07,
"loss": 0.0024,
"step": 16640
},
{
"epoch": 4.256134969325154,
"grad_norm": 0.005691261030733585,
"learning_rate": 6.596480916505993e-07,
"loss": 0.0034,
"step": 16650
},
{
"epoch": 4.2586912065439675,
"grad_norm": 0.018030589446425438,
"learning_rate": 6.552252853728958e-07,
"loss": 0.0003,
"step": 16660
},
{
"epoch": 4.261247443762781,
"grad_norm": 0.5344707369804382,
"learning_rate": 6.508163161662994e-07,
"loss": 0.001,
"step": 16670
},
{
"epoch": 4.263803680981595,
"grad_norm": 0.01154622994363308,
"learning_rate": 6.464211980723223e-07,
"loss": 0.0011,
"step": 16680
},
{
"epoch": 4.266359918200409,
"grad_norm": 0.014204679057002068,
"learning_rate": 6.42039945088369e-07,
"loss": 0.0008,
"step": 16690
},
{
"epoch": 4.268916155419223,
"grad_norm": 0.17623376846313477,
"learning_rate": 6.376725711676829e-07,
"loss": 0.0004,
"step": 16700
},
{
"epoch": 4.271472392638037,
"grad_norm": 0.15494291484355927,
"learning_rate": 6.33319090219311e-07,
"loss": 0.0002,
"step": 16710
},
{
"epoch": 4.274028629856851,
"grad_norm": 0.007995120249688625,
"learning_rate": 6.289795161080492e-07,
"loss": 0.0005,
"step": 16720
},
{
"epoch": 4.276584867075664,
"grad_norm": 0.006873907521367073,
"learning_rate": 6.246538626544074e-07,
"loss": 0.0021,
"step": 16730
},
{
"epoch": 4.279141104294479,
"grad_norm": 0.8828302621841431,
"learning_rate": 6.203421436345597e-07,
"loss": 0.0017,
"step": 16740
},
{
"epoch": 4.281697341513293,
"grad_norm": 0.027829930186271667,
"learning_rate": 6.160443727803034e-07,
"loss": 0.0083,
"step": 16750
},
{
"epoch": 4.284253578732106,
"grad_norm": 0.010724814608693123,
"learning_rate": 6.11760563779012e-07,
"loss": 0.001,
"step": 16760
},
{
"epoch": 4.28680981595092,
"grad_norm": 0.005555745679885149,
"learning_rate": 6.07490730273596e-07,
"loss": 0.0013,
"step": 16770
},
{
"epoch": 4.289366053169734,
"grad_norm": 0.5492291450500488,
"learning_rate": 6.03234885862457e-07,
"loss": 0.0018,
"step": 16780
},
{
"epoch": 4.291922290388548,
"grad_norm": 0.01208993699401617,
"learning_rate": 5.989930440994451e-07,
"loss": 0.001,
"step": 16790
},
{
"epoch": 4.294478527607362,
"grad_norm": 0.06638287752866745,
"learning_rate": 5.947652184938124e-07,
"loss": 0.0055,
"step": 16800
},
{
"epoch": 4.297034764826176,
"grad_norm": 0.021167244762182236,
"learning_rate": 5.905514225101761e-07,
"loss": 0.0027,
"step": 16810
},
{
"epoch": 4.2995910020449895,
"grad_norm": 0.01520733255892992,
"learning_rate": 5.863516695684713e-07,
"loss": 0.0002,
"step": 16820
},
{
"epoch": 4.302147239263804,
"grad_norm": 0.41557273268699646,
"learning_rate": 5.8216597304391e-07,
"loss": 0.0003,
"step": 16830
},
{
"epoch": 4.304703476482618,
"grad_norm": 0.0344698503613472,
"learning_rate": 5.779943462669357e-07,
"loss": 0.0006,
"step": 16840
},
{
"epoch": 4.3072597137014315,
"grad_norm": 0.5560601353645325,
"learning_rate": 5.738368025231856e-07,
"loss": 0.0053,
"step": 16850
},
{
"epoch": 4.309815950920245,
"grad_norm": 0.014782343059778214,
"learning_rate": 5.696933550534445e-07,
"loss": 0.0003,
"step": 16860
},
{
"epoch": 4.31237218813906,
"grad_norm": 0.018144864588975906,
"learning_rate": 5.655640170536053e-07,
"loss": 0.0014,
"step": 16870
},
{
"epoch": 4.3149284253578735,
"grad_norm": 0.0022675390355288982,
"learning_rate": 5.614488016746216e-07,
"loss": 0.0007,
"step": 16880
},
{
"epoch": 4.317484662576687,
"grad_norm": 0.22662724554538727,
"learning_rate": 5.573477220224777e-07,
"loss": 0.0006,
"step": 16890
},
{
"epoch": 4.320040899795501,
"grad_norm": 0.011389585211873055,
"learning_rate": 5.532607911581294e-07,
"loss": 0.0022,
"step": 16900
},
{
"epoch": 4.322597137014315,
"grad_norm": 0.06751693785190582,
"learning_rate": 5.491880220974799e-07,
"loss": 0.0005,
"step": 16910
},
{
"epoch": 4.325153374233129,
"grad_norm": 0.006349935662001371,
"learning_rate": 5.451294278113234e-07,
"loss": 0.0002,
"step": 16920
},
{
"epoch": 4.327709611451943,
"grad_norm": 0.013773099519312382,
"learning_rate": 5.410850212253193e-07,
"loss": 0.0001,
"step": 16930
},
{
"epoch": 4.330265848670757,
"grad_norm": 0.010721610859036446,
"learning_rate": 5.37054815219934e-07,
"loss": 0.0001,
"step": 16940
},
{
"epoch": 4.33282208588957,
"grad_norm": 0.3423142433166504,
"learning_rate": 5.330388226304145e-07,
"loss": 0.0005,
"step": 16950
},
{
"epoch": 4.335378323108385,
"grad_norm": 1.396784782409668,
"learning_rate": 5.290370562467378e-07,
"loss": 0.0034,
"step": 16960
},
{
"epoch": 4.337934560327199,
"grad_norm": 0.018410976976156235,
"learning_rate": 5.250495288135776e-07,
"loss": 0.001,
"step": 16970
},
{
"epoch": 4.340490797546012,
"grad_norm": 0.0063552772626280785,
"learning_rate": 5.210762530302554e-07,
"loss": 0.0005,
"step": 16980
},
{
"epoch": 4.343047034764826,
"grad_norm": 0.010990941897034645,
"learning_rate": 5.17117241550707e-07,
"loss": 0.0022,
"step": 16990
},
{
"epoch": 4.34560327198364,
"grad_norm": 0.009601407684385777,
"learning_rate": 5.131725069834403e-07,
"loss": 0.0004,
"step": 17000
},
{
"epoch": 4.348159509202454,
"grad_norm": 0.010526538826525211,
"learning_rate": 5.092420618914934e-07,
"loss": 0.0007,
"step": 17010
},
{
"epoch": 4.350715746421268,
"grad_norm": 0.02961459383368492,
"learning_rate": 5.053259187923981e-07,
"loss": 0.0008,
"step": 17020
},
{
"epoch": 4.353271983640082,
"grad_norm": 0.9297602772712708,
"learning_rate": 5.01424090158133e-07,
"loss": 0.0008,
"step": 17030
},
{
"epoch": 4.355828220858895,
"grad_norm": 0.12763309478759766,
"learning_rate": 4.975365884150951e-07,
"loss": 0.0002,
"step": 17040
},
{
"epoch": 4.35838445807771,
"grad_norm": 0.005043504294008017,
"learning_rate": 4.93663425944047e-07,
"loss": 0.001,
"step": 17050
},
{
"epoch": 4.360940695296524,
"grad_norm": 0.7221952676773071,
"learning_rate": 4.8980461508009e-07,
"loss": 0.0012,
"step": 17060
},
{
"epoch": 4.363496932515337,
"grad_norm": 0.00512282457202673,
"learning_rate": 4.85960168112613e-07,
"loss": 0.0041,
"step": 17070
},
{
"epoch": 4.366053169734151,
"grad_norm": 0.27002206444740295,
"learning_rate": 4.821300972852666e-07,
"loss": 0.002,
"step": 17080
},
{
"epoch": 4.368609406952965,
"grad_norm": 0.0055974265560507774,
"learning_rate": 4.783144147959096e-07,
"loss": 0.0001,
"step": 17090
},
{
"epoch": 4.371165644171779,
"grad_norm": 0.011708883568644524,
"learning_rate": 4.745131327965818e-07,
"loss": 0.0006,
"step": 17100
},
{
"epoch": 4.373721881390593,
"grad_norm": 0.47823408246040344,
"learning_rate": 4.7072626339345896e-07,
"loss": 0.0006,
"step": 17110
},
{
"epoch": 4.376278118609407,
"grad_norm": 0.01948222517967224,
"learning_rate": 4.669538186468192e-07,
"loss": 0.0007,
"step": 17120
},
{
"epoch": 4.378834355828221,
"grad_norm": 0.006773567758500576,
"learning_rate": 4.6319581057099604e-07,
"loss": 0.0009,
"step": 17130
},
{
"epoch": 4.381390593047035,
"grad_norm": 0.009596975520253181,
"learning_rate": 4.5945225113435024e-07,
"loss": 0.0005,
"step": 17140
},
{
"epoch": 4.383946830265849,
"grad_norm": 0.007052087225019932,
"learning_rate": 4.557231522592254e-07,
"loss": 0.0102,
"step": 17150
},
{
"epoch": 4.386503067484663,
"grad_norm": 0.00172753247898072,
"learning_rate": 4.520085258219131e-07,
"loss": 0.0011,
"step": 17160
},
{
"epoch": 4.389059304703476,
"grad_norm": 0.0074590700678527355,
"learning_rate": 4.4830838365261086e-07,
"loss": 0.0003,
"step": 17170
},
{
"epoch": 4.39161554192229,
"grad_norm": 0.011804984882473946,
"learning_rate": 4.446227375353895e-07,
"loss": 0.0001,
"step": 17180
},
{
"epoch": 4.394171779141105,
"grad_norm": 0.011131849139928818,
"learning_rate": 4.4095159920815254e-07,
"loss": 0.0064,
"step": 17190
},
{
"epoch": 4.396728016359918,
"grad_norm": 0.01048702746629715,
"learning_rate": 4.3729498036260144e-07,
"loss": 0.0001,
"step": 17200
},
{
"epoch": 4.399284253578732,
"grad_norm": 0.005339341703802347,
"learning_rate": 4.336528926441924e-07,
"loss": 0.0013,
"step": 17210
},
{
"epoch": 4.401840490797546,
"grad_norm": 0.005323043093085289,
"learning_rate": 4.300253476521077e-07,
"loss": 0.0006,
"step": 17220
},
{
"epoch": 4.40439672801636,
"grad_norm": 4.231276512145996,
"learning_rate": 4.2641235693921257e-07,
"loss": 0.0021,
"step": 17230
},
{
"epoch": 4.406952965235174,
"grad_norm": 0.007030295208096504,
"learning_rate": 4.228139320120211e-07,
"loss": 0.0004,
"step": 17240
},
{
"epoch": 4.409509202453988,
"grad_norm": 0.0034739875700324774,
"learning_rate": 4.1923008433065627e-07,
"loss": 0.0014,
"step": 17250
},
{
"epoch": 4.412065439672801,
"grad_norm": 0.008167327381670475,
"learning_rate": 4.1566082530882126e-07,
"loss": 0.0004,
"step": 17260
},
{
"epoch": 4.414621676891615,
"grad_norm": 0.008502046577632427,
"learning_rate": 4.1210616631375267e-07,
"loss": 0.0003,
"step": 17270
},
{
"epoch": 4.41717791411043,
"grad_norm": 0.011800073087215424,
"learning_rate": 4.085661186661921e-07,
"loss": 0.0001,
"step": 17280
},
{
"epoch": 4.419734151329243,
"grad_norm": 0.013263700529932976,
"learning_rate": 4.050406936403456e-07,
"loss": 0.0001,
"step": 17290
},
{
"epoch": 4.422290388548057,
"grad_norm": 2.6378958225250244,
"learning_rate": 4.015299024638536e-07,
"loss": 0.001,
"step": 17300
},
{
"epoch": 4.424846625766871,
"grad_norm": 0.012993947602808475,
"learning_rate": 3.9803375631774555e-07,
"loss": 0.0008,
"step": 17310
},
{
"epoch": 4.427402862985685,
"grad_norm": 0.01075220387428999,
"learning_rate": 3.945522663364154e-07,
"loss": 0.0001,
"step": 17320
},
{
"epoch": 4.429959100204499,
"grad_norm": 0.2742706537246704,
"learning_rate": 3.910854436075767e-07,
"loss": 0.0002,
"step": 17330
},
{
"epoch": 4.432515337423313,
"grad_norm": 0.007065648213028908,
"learning_rate": 3.876332991722348e-07,
"loss": 0.0016,
"step": 17340
},
{
"epoch": 4.4350715746421265,
"grad_norm": 0.020136894658207893,
"learning_rate": 3.84195844024644e-07,
"loss": 0.0005,
"step": 17350
},
{
"epoch": 4.43762781186094,
"grad_norm": 0.027455536648631096,
"learning_rate": 3.8077308911227964e-07,
"loss": 0.0006,
"step": 17360
},
{
"epoch": 4.440184049079755,
"grad_norm": 0.05177016928792,
"learning_rate": 3.773650453358008e-07,
"loss": 0.0005,
"step": 17370
},
{
"epoch": 4.4427402862985685,
"grad_norm": 1.8424170017242432,
"learning_rate": 3.739717235490137e-07,
"loss": 0.0013,
"step": 17380
},
{
"epoch": 4.445296523517382,
"grad_norm": 0.1712723970413208,
"learning_rate": 3.705931345588376e-07,
"loss": 0.0003,
"step": 17390
},
{
"epoch": 4.447852760736196,
"grad_norm": 0.09310045093297958,
"learning_rate": 3.672292891252732e-07,
"loss": 0.0001,
"step": 17400
},
{
"epoch": 4.4504089979550105,
"grad_norm": 0.0639791414141655,
"learning_rate": 3.6388019796136654e-07,
"loss": 0.001,
"step": 17410
},
{
"epoch": 4.452965235173824,
"grad_norm": 0.0685553252696991,
"learning_rate": 3.605458717331739e-07,
"loss": 0.0006,
"step": 17420
},
{
"epoch": 4.455521472392638,
"grad_norm": 0.034271907061338425,
"learning_rate": 3.5722632105972765e-07,
"loss": 0.0007,
"step": 17430
},
{
"epoch": 4.458077709611452,
"grad_norm": 0.04286443442106247,
"learning_rate": 3.539215565130055e-07,
"loss": 0.0007,
"step": 17440
},
{
"epoch": 4.460633946830266,
"grad_norm": 0.02046520821750164,
"learning_rate": 3.506315886178957e-07,
"loss": 0.0006,
"step": 17450
},
{
"epoch": 4.46319018404908,
"grad_norm": 0.012680341489613056,
"learning_rate": 3.4735642785215963e-07,
"loss": 0.0041,
"step": 17460
},
{
"epoch": 4.465746421267894,
"grad_norm": 0.005123642738908529,
"learning_rate": 3.4409608464640366e-07,
"loss": 0.0006,
"step": 17470
},
{
"epoch": 4.468302658486707,
"grad_norm": 0.003161899745464325,
"learning_rate": 3.4085056938404303e-07,
"loss": 0.0009,
"step": 17480
},
{
"epoch": 4.470858895705521,
"grad_norm": 0.009017124772071838,
"learning_rate": 3.376198924012708e-07,
"loss": 0.001,
"step": 17490
},
{
"epoch": 4.473415132924336,
"grad_norm": 0.04075018689036369,
"learning_rate": 3.3440406398702055e-07,
"loss": 0.0024,
"step": 17500
},
{
"epoch": 4.475971370143149,
"grad_norm": 3.3212058544158936,
"learning_rate": 3.3120309438293973e-07,
"loss": 0.0038,
"step": 17510
},
{
"epoch": 4.478527607361963,
"grad_norm": 0.15001584589481354,
"learning_rate": 3.2801699378335274e-07,
"loss": 0.0001,
"step": 17520
},
{
"epoch": 4.481083844580777,
"grad_norm": 0.054374609142541885,
"learning_rate": 3.248457723352316e-07,
"loss": 0.0008,
"step": 17530
},
{
"epoch": 4.483640081799591,
"grad_norm": 0.01903144083917141,
"learning_rate": 3.2168944013815764e-07,
"loss": 0.0002,
"step": 17540
},
{
"epoch": 4.486196319018405,
"grad_norm": 0.012091502547264099,
"learning_rate": 3.1854800724429703e-07,
"loss": 0.0003,
"step": 17550
},
{
"epoch": 4.488752556237219,
"grad_norm": 0.0041188085451722145,
"learning_rate": 3.1542148365836465e-07,
"loss": 0.0001,
"step": 17560
},
{
"epoch": 4.4913087934560325,
"grad_norm": 0.004006090573966503,
"learning_rate": 3.123098793375928e-07,
"loss": 0.0055,
"step": 17570
},
{
"epoch": 4.493865030674846,
"grad_norm": 4.543797016143799,
"learning_rate": 3.092132041916979e-07,
"loss": 0.0013,
"step": 17580
},
{
"epoch": 4.496421267893661,
"grad_norm": 0.006038820371031761,
"learning_rate": 3.06131468082852e-07,
"loss": 0.0002,
"step": 17590
},
{
"epoch": 4.4989775051124745,
"grad_norm": 0.06375231593847275,
"learning_rate": 3.0306468082564933e-07,
"loss": 0.0013,
"step": 17600
},
{
"epoch": 4.501533742331288,
"grad_norm": 0.010167334228754044,
"learning_rate": 3.000128521870771e-07,
"loss": 0.0001,
"step": 17610
},
{
"epoch": 4.504089979550102,
"grad_norm": 0.3248971700668335,
"learning_rate": 2.969759918864784e-07,
"loss": 0.0001,
"step": 17620
},
{
"epoch": 4.5066462167689165,
"grad_norm": 0.0035574256908148527,
"learning_rate": 2.939541095955334e-07,
"loss": 0.0016,
"step": 17630
},
{
"epoch": 4.50920245398773,
"grad_norm": 0.0564039871096611,
"learning_rate": 2.9094721493821255e-07,
"loss": 0.0009,
"step": 17640
},
{
"epoch": 4.511758691206544,
"grad_norm": 0.0021962756291031837,
"learning_rate": 2.8795531749076067e-07,
"loss": 0.0008,
"step": 17650
},
{
"epoch": 4.514314928425358,
"grad_norm": 0.00536281056702137,
"learning_rate": 2.8497842678165467e-07,
"loss": 0.0006,
"step": 17660
},
{
"epoch": 4.516871165644172,
"grad_norm": 0.007071573752909899,
"learning_rate": 2.8201655229158465e-07,
"loss": 0.0002,
"step": 17670
},
{
"epoch": 4.519427402862986,
"grad_norm": 6.006162643432617,
"learning_rate": 2.7906970345341177e-07,
"loss": 0.0027,
"step": 17680
},
{
"epoch": 4.5219836400818,
"grad_norm": 0.013232548721134663,
"learning_rate": 2.761378896521477e-07,
"loss": 0.0009,
"step": 17690
},
{
"epoch": 4.524539877300613,
"grad_norm": 0.007870076224207878,
"learning_rate": 2.732211202249202e-07,
"loss": 0.0004,
"step": 17700
},
{
"epoch": 4.527096114519427,
"grad_norm": 0.001714581623673439,
"learning_rate": 2.7031940446094475e-07,
"loss": 0.0005,
"step": 17710
},
{
"epoch": 4.529652351738242,
"grad_norm": 0.7307052612304688,
"learning_rate": 2.674327516014924e-07,
"loss": 0.0004,
"step": 17720
},
{
"epoch": 4.532208588957055,
"grad_norm": 0.15873846411705017,
"learning_rate": 2.6456117083986487e-07,
"loss": 0.0001,
"step": 17730
},
{
"epoch": 4.534764826175869,
"grad_norm": 0.02356737293303013,
"learning_rate": 2.617046713213617e-07,
"loss": 0.0005,
"step": 17740
},
{
"epoch": 4.537321063394683,
"grad_norm": 0.005058961920440197,
"learning_rate": 2.5886326214325297e-07,
"loss": 0.0061,
"step": 17750
},
{
"epoch": 4.539877300613497,
"grad_norm": 0.007151829544454813,
"learning_rate": 2.560369523547485e-07,
"loss": 0.0011,
"step": 17760
},
{
"epoch": 4.542433537832311,
"grad_norm": 0.008786034770309925,
"learning_rate": 2.5322575095697077e-07,
"loss": 0.0012,
"step": 17770
},
{
"epoch": 4.544989775051125,
"grad_norm": 0.01927710324525833,
"learning_rate": 2.50429666902926e-07,
"loss": 0.0001,
"step": 17780
},
{
"epoch": 4.5475460122699385,
"grad_norm": 0.008998622186481953,
"learning_rate": 2.476487090974755e-07,
"loss": 0.0002,
"step": 17790
},
{
"epoch": 4.550102249488752,
"grad_norm": 0.007529801689088345,
"learning_rate": 2.448828863973052e-07,
"loss": 0.0006,
"step": 17800
},
{
"epoch": 4.552658486707567,
"grad_norm": 0.039857879281044006,
"learning_rate": 2.4213220761090173e-07,
"loss": 0.0069,
"step": 17810
},
{
"epoch": 4.5552147239263805,
"grad_norm": 0.008589456789195538,
"learning_rate": 2.3939668149852046e-07,
"loss": 0.0001,
"step": 17820
},
{
"epoch": 4.557770961145194,
"grad_norm": 0.005125945899635553,
"learning_rate": 2.366763167721603e-07,
"loss": 0.0003,
"step": 17830
},
{
"epoch": 4.560327198364008,
"grad_norm": 0.004016405437141657,
"learning_rate": 2.3397112209553207e-07,
"loss": 0.0002,
"step": 17840
},
{
"epoch": 4.5628834355828225,
"grad_norm": 0.08685509860515594,
"learning_rate": 2.312811060840381e-07,
"loss": 0.0026,
"step": 17850
},
{
"epoch": 4.565439672801636,
"grad_norm": 0.0353722870349884,
"learning_rate": 2.286062773047354e-07,
"loss": 0.0001,
"step": 17860
},
{
"epoch": 4.56799591002045,
"grad_norm": 0.021159430965781212,
"learning_rate": 2.2594664427631807e-07,
"loss": 0.0001,
"step": 17870
},
{
"epoch": 4.570552147239264,
"grad_norm": 0.02464616298675537,
"learning_rate": 2.2330221546908005e-07,
"loss": 0.0002,
"step": 17880
},
{
"epoch": 4.573108384458077,
"grad_norm": 0.042522210627794266,
"learning_rate": 2.2067299930489838e-07,
"loss": 0.0002,
"step": 17890
},
{
"epoch": 4.575664621676892,
"grad_norm": 0.03682945668697357,
"learning_rate": 2.180590041571995e-07,
"loss": 0.0001,
"step": 17900
},
{
"epoch": 4.578220858895706,
"grad_norm": 0.006650723051279783,
"learning_rate": 2.15460238350933e-07,
"loss": 0.0001,
"step": 17910
},
{
"epoch": 4.580777096114519,
"grad_norm": 0.008757648058235645,
"learning_rate": 2.1287671016254897e-07,
"loss": 0.0002,
"step": 17920
},
{
"epoch": 4.583333333333333,
"grad_norm": 0.17380449175834656,
"learning_rate": 2.1030842781996796e-07,
"loss": 0.0002,
"step": 17930
},
{
"epoch": 4.585889570552148,
"grad_norm": 0.010563348419964314,
"learning_rate": 2.0775539950255774e-07,
"loss": 0.0002,
"step": 17940
},
{
"epoch": 4.588445807770961,
"grad_norm": 0.1902349591255188,
"learning_rate": 2.0521763334110324e-07,
"loss": 0.0002,
"step": 17950
},
{
"epoch": 4.591002044989775,
"grad_norm": 0.012331271544098854,
"learning_rate": 2.0269513741778492e-07,
"loss": 0.0002,
"step": 17960
},
{
"epoch": 4.593558282208589,
"grad_norm": 0.03246452286839485,
"learning_rate": 2.0018791976615048e-07,
"loss": 0.0001,
"step": 17970
},
{
"epoch": 4.5961145194274025,
"grad_norm": 0.03161914646625519,
"learning_rate": 1.9769598837109105e-07,
"loss": 0.0051,
"step": 17980
},
{
"epoch": 4.598670756646217,
"grad_norm": 0.0297340489923954,
"learning_rate": 1.9521935116881107e-07,
"loss": 0.0002,
"step": 17990
},
{
"epoch": 4.601226993865031,
"grad_norm": 3.1284525394439697,
"learning_rate": 1.9275801604681232e-07,
"loss": 0.0012,
"step": 18000
},
{
"epoch": 4.6037832310838445,
"grad_norm": 0.005135530140250921,
"learning_rate": 1.9031199084385833e-07,
"loss": 0.0006,
"step": 18010
},
{
"epoch": 4.606339468302658,
"grad_norm": 0.005707759875804186,
"learning_rate": 1.8788128334995715e-07,
"loss": 0.0001,
"step": 18020
},
{
"epoch": 4.608895705521473,
"grad_norm": 0.008373766206204891,
"learning_rate": 1.8546590130633035e-07,
"loss": 0.0011,
"step": 18030
},
{
"epoch": 4.6114519427402865,
"grad_norm": 0.013387088663876057,
"learning_rate": 1.8306585240539576e-07,
"loss": 0.0,
"step": 18040
},
{
"epoch": 4.6140081799591,
"grad_norm": 0.0016770199872553349,
"learning_rate": 1.8068114429073524e-07,
"loss": 0.0002,
"step": 18050
},
{
"epoch": 4.616564417177914,
"grad_norm": 0.005193398799747229,
"learning_rate": 1.7831178455707533e-07,
"loss": 0.0001,
"step": 18060
},
{
"epoch": 4.619120654396728,
"grad_norm": 0.008146319538354874,
"learning_rate": 1.759577807502627e-07,
"loss": 0.001,
"step": 18070
},
{
"epoch": 4.621676891615542,
"grad_norm": 0.00837623793631792,
"learning_rate": 1.736191403672377e-07,
"loss": 0.0028,
"step": 18080
},
{
"epoch": 4.624233128834356,
"grad_norm": 0.0052582272328436375,
"learning_rate": 1.7129587085601084e-07,
"loss": 0.0031,
"step": 18090
},
{
"epoch": 4.62678936605317,
"grad_norm": 0.004001881927251816,
"learning_rate": 1.689879796156424e-07,
"loss": 0.0001,
"step": 18100
},
{
"epoch": 4.629345603271983,
"grad_norm": 0.00547180837020278,
"learning_rate": 1.6669547399621567e-07,
"loss": 0.0012,
"step": 18110
},
{
"epoch": 4.631901840490798,
"grad_norm": 0.01456737145781517,
"learning_rate": 1.6441836129881427e-07,
"loss": 0.0002,
"step": 18120
},
{
"epoch": 4.634458077709612,
"grad_norm": 0.02086499147117138,
"learning_rate": 1.6215664877549774e-07,
"loss": 0.0005,
"step": 18130
},
{
"epoch": 4.637014314928425,
"grad_norm": 0.012769919820129871,
"learning_rate": 1.5991034362928204e-07,
"loss": 0.0002,
"step": 18140
},
{
"epoch": 4.639570552147239,
"grad_norm": 0.18312056362628937,
"learning_rate": 1.576794530141129e-07,
"loss": 0.0016,
"step": 18150
},
{
"epoch": 4.642126789366053,
"grad_norm": 0.0038857313338667154,
"learning_rate": 1.5546398403484542e-07,
"loss": 0.0028,
"step": 18160
},
{
"epoch": 4.644683026584867,
"grad_norm": 0.011599598452448845,
"learning_rate": 1.5326394374721887e-07,
"loss": 0.0001,
"step": 18170
},
{
"epoch": 4.647239263803681,
"grad_norm": 0.01293268147855997,
"learning_rate": 1.5107933915783745e-07,
"loss": 0.0008,
"step": 18180
},
{
"epoch": 4.649795501022495,
"grad_norm": 0.012520798482000828,
"learning_rate": 1.4891017722414525e-07,
"loss": 0.0012,
"step": 18190
},
{
"epoch": 4.652351738241308,
"grad_norm": 1.7857009172439575,
"learning_rate": 1.467564648544062e-07,
"loss": 0.0014,
"step": 18200
},
{
"epoch": 4.654907975460123,
"grad_norm": 0.0051713059656322,
"learning_rate": 1.4461820890767976e-07,
"loss": 0.0001,
"step": 18210
},
{
"epoch": 4.657464212678937,
"grad_norm": 0.27911797165870667,
"learning_rate": 1.424954161938019e-07,
"loss": 0.0002,
"step": 18220
},
{
"epoch": 4.66002044989775,
"grad_norm": 0.030297599732875824,
"learning_rate": 1.4038809347336036e-07,
"loss": 0.0001,
"step": 18230
},
{
"epoch": 4.662576687116564,
"grad_norm": 0.008832601830363274,
"learning_rate": 1.38296247457676e-07,
"loss": 0.0002,
"step": 18240
},
{
"epoch": 4.665132924335378,
"grad_norm": 0.015586239285767078,
"learning_rate": 1.3621988480877812e-07,
"loss": 0.0001,
"step": 18250
},
{
"epoch": 4.6676891615541924,
"grad_norm": 0.010234987363219261,
"learning_rate": 1.341590121393882e-07,
"loss": 0.0001,
"step": 18260
},
{
"epoch": 4.670245398773006,
"grad_norm": 0.0027397891972213984,
"learning_rate": 1.3211363601289273e-07,
"loss": 0.0033,
"step": 18270
},
{
"epoch": 4.67280163599182,
"grad_norm": 0.009892730042338371,
"learning_rate": 1.3008376294332715e-07,
"loss": 0.0001,
"step": 18280
},
{
"epoch": 4.675357873210634,
"grad_norm": 0.03709021210670471,
"learning_rate": 1.2806939939535358e-07,
"loss": 0.0001,
"step": 18290
},
{
"epoch": 4.677914110429448,
"grad_norm": 0.006908297538757324,
"learning_rate": 1.2607055178423978e-07,
"loss": 0.0001,
"step": 18300
},
{
"epoch": 4.680470347648262,
"grad_norm": 0.009108340367674828,
"learning_rate": 1.2408722647583692e-07,
"loss": 0.0001,
"step": 18310
},
{
"epoch": 4.683026584867076,
"grad_norm": 0.009007609449326992,
"learning_rate": 1.221194297865641e-07,
"loss": 0.0006,
"step": 18320
},
{
"epoch": 4.685582822085889,
"grad_norm": 0.16736240684986115,
"learning_rate": 1.2016716798338436e-07,
"loss": 0.0004,
"step": 18330
},
{
"epoch": 4.688139059304703,
"grad_norm": 0.015963025391101837,
"learning_rate": 1.182304472837853e-07,
"loss": 0.0001,
"step": 18340
},
{
"epoch": 4.690695296523518,
"grad_norm": 2.035236358642578,
"learning_rate": 1.1630927385576196e-07,
"loss": 0.0034,
"step": 18350
},
{
"epoch": 4.693251533742331,
"grad_norm": 0.017824502661824226,
"learning_rate": 1.1440365381779117e-07,
"loss": 0.0003,
"step": 18360
},
{
"epoch": 4.695807770961145,
"grad_norm": 0.021831089630723,
"learning_rate": 1.1251359323881994e-07,
"loss": 0.0004,
"step": 18370
},
{
"epoch": 4.69836400817996,
"grad_norm": 0.016453437507152557,
"learning_rate": 1.1063909813823992e-07,
"loss": 0.0002,
"step": 18380
},
{
"epoch": 4.700920245398773,
"grad_norm": 0.0010980580700561404,
"learning_rate": 1.0878017448587075e-07,
"loss": 0.0007,
"step": 18390
},
{
"epoch": 4.703476482617587,
"grad_norm": 0.015402301214635372,
"learning_rate": 1.0693682820194062e-07,
"loss": 0.0001,
"step": 18400
},
{
"epoch": 4.706032719836401,
"grad_norm": 0.00750606507062912,
"learning_rate": 1.0510906515706798e-07,
"loss": 0.0001,
"step": 18410
},
{
"epoch": 4.708588957055214,
"grad_norm": 0.007333151530474424,
"learning_rate": 1.0329689117224262e-07,
"loss": 0.0001,
"step": 18420
},
{
"epoch": 4.711145194274029,
"grad_norm": 0.043747998774051666,
"learning_rate": 1.0150031201880573e-07,
"loss": 0.0017,
"step": 18430
},
{
"epoch": 4.713701431492843,
"grad_norm": 0.038498375564813614,
"learning_rate": 9.97193334184332e-08,
"loss": 0.0005,
"step": 18440
},
{
"epoch": 4.716257668711656,
"grad_norm": 0.01316425297409296,
"learning_rate": 9.79539610431185e-08,
"loss": 0.0001,
"step": 18450
},
{
"epoch": 4.71881390593047,
"grad_norm": 0.014060962945222855,
"learning_rate": 9.620420051514978e-08,
"loss": 0.0002,
"step": 18460
},
{
"epoch": 4.721370143149285,
"grad_norm": 0.014505197294056416,
"learning_rate": 9.44700574070978e-08,
"loss": 0.002,
"step": 18470
},
{
"epoch": 4.723926380368098,
"grad_norm": 0.006656293291598558,
"learning_rate": 9.275153724179475e-08,
"loss": 0.001,
"step": 18480
},
{
"epoch": 4.726482617586912,
"grad_norm": 0.009168506599962711,
"learning_rate": 9.104864549231706e-08,
"loss": 0.0005,
"step": 18490
},
{
"epoch": 4.729038854805726,
"grad_norm": 0.016905134543776512,
"learning_rate": 8.936138758196933e-08,
"loss": 0.0001,
"step": 18500
},
{
"epoch": 4.7315950920245395,
"grad_norm": 0.012130284681916237,
"learning_rate": 8.768976888426484e-08,
"loss": 0.0001,
"step": 18510
},
{
"epoch": 4.734151329243354,
"grad_norm": 0.08311706781387329,
"learning_rate": 8.603379472291118e-08,
"loss": 0.0006,
"step": 18520
},
{
"epoch": 4.736707566462168,
"grad_norm": 0.006343462970107794,
"learning_rate": 8.43934703717908e-08,
"loss": 0.0001,
"step": 18530
},
{
"epoch": 4.7392638036809815,
"grad_norm": 0.0163714736700058,
"learning_rate": 8.27688010549449e-08,
"loss": 0.0001,
"step": 18540
},
{
"epoch": 4.741820040899795,
"grad_norm": 0.004221266135573387,
"learning_rate": 8.115979194655843e-08,
"loss": 0.0006,
"step": 18550
},
{
"epoch": 4.74437627811861,
"grad_norm": 0.21704137325286865,
"learning_rate": 7.956644817094072e-08,
"loss": 0.0005,
"step": 18560
},
{
"epoch": 4.7469325153374236,
"grad_norm": 0.010948584415018559,
"learning_rate": 7.798877480251321e-08,
"loss": 0.0002,
"step": 18570
},
{
"epoch": 4.749488752556237,
"grad_norm": 0.004772350657731295,
"learning_rate": 7.642677686578726e-08,
"loss": 0.0001,
"step": 18580
},
{
"epoch": 4.752044989775051,
"grad_norm": 0.005361333955079317,
"learning_rate": 7.488045933535582e-08,
"loss": 0.0004,
"step": 18590
},
{
"epoch": 4.754601226993865,
"grad_norm": 0.014715801924467087,
"learning_rate": 7.334982713586958e-08,
"loss": 0.0013,
"step": 18600
},
{
"epoch": 4.757157464212679,
"grad_norm": 0.16467002034187317,
"learning_rate": 7.183488514202863e-08,
"loss": 0.0002,
"step": 18610
},
{
"epoch": 4.759713701431493,
"grad_norm": 0.01957176998257637,
"learning_rate": 7.03356381785597e-08,
"loss": 0.0001,
"step": 18620
},
{
"epoch": 4.762269938650307,
"grad_norm": 1.6649690866470337,
"learning_rate": 6.885209102020896e-08,
"loss": 0.0014,
"step": 18630
},
{
"epoch": 4.76482617586912,
"grad_norm": 0.3953768014907837,
"learning_rate": 6.73842483917192e-08,
"loss": 0.0001,
"step": 18640
},
{
"epoch": 4.767382413087935,
"grad_norm": 0.14624808728694916,
"learning_rate": 6.593211496781881e-08,
"loss": 0.0004,
"step": 18650
},
{
"epoch": 4.769938650306749,
"grad_norm": 0.1692701280117035,
"learning_rate": 6.449569537320677e-08,
"loss": 0.0002,
"step": 18660
},
{
"epoch": 4.772494887525562,
"grad_norm": 0.009253126569092274,
"learning_rate": 6.307499418253705e-08,
"loss": 0.0001,
"step": 18670
},
{
"epoch": 4.775051124744376,
"grad_norm": 0.011899994686245918,
"learning_rate": 6.167001592040367e-08,
"loss": 0.0001,
"step": 18680
},
{
"epoch": 4.77760736196319,
"grad_norm": 0.0060684094205498695,
"learning_rate": 6.028076506132741e-08,
"loss": 0.0014,
"step": 18690
},
{
"epoch": 4.780163599182004,
"grad_norm": 0.004709139931946993,
"learning_rate": 5.890724602974074e-08,
"loss": 0.0001,
"step": 18700
},
{
"epoch": 4.782719836400818,
"grad_norm": 0.01010716613382101,
"learning_rate": 5.7549463199974566e-08,
"loss": 0.0001,
"step": 18710
},
{
"epoch": 4.785276073619632,
"grad_norm": 0.016905710101127625,
"learning_rate": 5.6207420896242646e-08,
"loss": 0.0001,
"step": 18720
},
{
"epoch": 4.7878323108384455,
"grad_norm": 0.016229376196861267,
"learning_rate": 5.488112339263052e-08,
"loss": 0.0002,
"step": 18730
},
{
"epoch": 4.79038854805726,
"grad_norm": 0.05223598703742027,
"learning_rate": 5.3570574913078264e-08,
"loss": 0.0006,
"step": 18740
},
{
"epoch": 4.792944785276074,
"grad_norm": 2.0258493423461914,
"learning_rate": 5.2275779631371646e-08,
"loss": 0.0006,
"step": 18750
},
{
"epoch": 4.7955010224948875,
"grad_norm": 3.3475382328033447,
"learning_rate": 5.0996741671123226e-08,
"loss": 0.0017,
"step": 18760
},
{
"epoch": 4.798057259713701,
"grad_norm": 0.05064383149147034,
"learning_rate": 4.97334651057646e-08,
"loss": 0.0011,
"step": 18770
},
{
"epoch": 4.800613496932515,
"grad_norm": 0.023241933435201645,
"learning_rate": 4.8485953958530286e-08,
"loss": 0.0001,
"step": 18780
},
{
"epoch": 4.8031697341513295,
"grad_norm": 0.016580374911427498,
"learning_rate": 4.725421220244553e-08,
"loss": 0.0005,
"step": 18790
},
{
"epoch": 4.805725971370143,
"grad_norm": 0.029072437435388565,
"learning_rate": 4.603824376031407e-08,
"loss": 0.0009,
"step": 18800
},
{
"epoch": 4.808282208588957,
"grad_norm": 0.012765723280608654,
"learning_rate": 4.4838052504705406e-08,
"loss": 0.0001,
"step": 18810
},
{
"epoch": 4.810838445807771,
"grad_norm": 0.10864470154047012,
"learning_rate": 4.3653642257943105e-08,
"loss": 0.0001,
"step": 18820
},
{
"epoch": 4.813394683026585,
"grad_norm": 0.876848578453064,
"learning_rate": 4.248501679208983e-08,
"loss": 0.0015,
"step": 18830
},
{
"epoch": 4.815950920245399,
"grad_norm": 2.7491190433502197,
"learning_rate": 4.133217982894011e-08,
"loss": 0.0034,
"step": 18840
},
{
"epoch": 4.818507157464213,
"grad_norm": 0.014472966082394123,
"learning_rate": 4.019513504000372e-08,
"loss": 0.002,
"step": 18850
},
{
"epoch": 4.821063394683026,
"grad_norm": 2.2643988132476807,
"learning_rate": 3.907388604649842e-08,
"loss": 0.0031,
"step": 18860
},
{
"epoch": 4.82361963190184,
"grad_norm": 1.9290136098861694,
"learning_rate": 3.796843641933334e-08,
"loss": 0.0006,
"step": 18870
},
{
"epoch": 4.826175869120655,
"grad_norm": 0.0031455198768526316,
"learning_rate": 3.687878967910285e-08,
"loss": 0.0023,
"step": 18880
},
{
"epoch": 4.828732106339468,
"grad_norm": 0.07761465013027191,
"learning_rate": 3.580494929607159e-08,
"loss": 0.0017,
"step": 18890
},
{
"epoch": 4.831288343558282,
"grad_norm": 0.007843953557312489,
"learning_rate": 3.4746918690165e-08,
"loss": 0.0001,
"step": 18900
},
{
"epoch": 4.833844580777096,
"grad_norm": 0.012562461197376251,
"learning_rate": 3.370470123095826e-08,
"loss": 0.0,
"step": 18910
},
{
"epoch": 4.83640081799591,
"grad_norm": 0.030938010662794113,
"learning_rate": 3.267830023766516e-08,
"loss": 0.0003,
"step": 18920
},
{
"epoch": 4.838957055214724,
"grad_norm": 0.011501450091600418,
"learning_rate": 3.166771897912868e-08,
"loss": 0.0,
"step": 18930
},
{
"epoch": 4.841513292433538,
"grad_norm": 0.020580632612109184,
"learning_rate": 3.0672960673808205e-08,
"loss": 0.0001,
"step": 18940
},
{
"epoch": 4.8440695296523515,
"grad_norm": 0.9514909386634827,
"learning_rate": 2.969402848977232e-08,
"loss": 0.0004,
"step": 18950
},
{
"epoch": 4.846625766871165,
"grad_norm": 0.01408900786191225,
"learning_rate": 2.873092554468604e-08,
"loss": 0.0017,
"step": 18960
},
{
"epoch": 4.84918200408998,
"grad_norm": 0.007879039272665977,
"learning_rate": 2.7783654905803036e-08,
"loss": 0.0007,
"step": 18970
},
{
"epoch": 4.8517382413087935,
"grad_norm": 0.00814911350607872,
"learning_rate": 2.6852219589953986e-08,
"loss": 0.0012,
"step": 18980
},
{
"epoch": 4.854294478527607,
"grad_norm": 0.006910000462085009,
"learning_rate": 2.5936622563537685e-08,
"loss": 0.0,
"step": 18990
},
{
"epoch": 4.856850715746421,
"grad_norm": 0.01161187607795,
"learning_rate": 2.503686674251382e-08,
"loss": 0.0002,
"step": 19000
},
{
"epoch": 4.8594069529652355,
"grad_norm": 0.4636043906211853,
"learning_rate": 2.4152954992388565e-08,
"loss": 0.0005,
"step": 19010
},
{
"epoch": 4.861963190184049,
"grad_norm": 0.003940473776310682,
"learning_rate": 2.328489012821067e-08,
"loss": 0.0001,
"step": 19020
},
{
"epoch": 4.864519427402863,
"grad_norm": 0.01835138350725174,
"learning_rate": 2.243267491455925e-08,
"loss": 0.0018,
"step": 19030
},
{
"epoch": 4.867075664621677,
"grad_norm": 0.003659491892904043,
"learning_rate": 2.159631206553714e-08,
"loss": 0.0004,
"step": 19040
},
{
"epoch": 4.86963190184049,
"grad_norm": 0.010196760296821594,
"learning_rate": 2.077580424475978e-08,
"loss": 0.0007,
"step": 19050
},
{
"epoch": 4.872188139059305,
"grad_norm": 0.40100085735321045,
"learning_rate": 1.9971154065349108e-08,
"loss": 0.0008,
"step": 19060
},
{
"epoch": 4.874744376278119,
"grad_norm": 0.0034521687775850296,
"learning_rate": 1.9182364089924134e-08,
"loss": 0.0001,
"step": 19070
},
{
"epoch": 4.877300613496932,
"grad_norm": 0.019500114023685455,
"learning_rate": 1.8409436830593152e-08,
"loss": 0.0002,
"step": 19080
},
{
"epoch": 4.879856850715746,
"grad_norm": 0.006733261980116367,
"learning_rate": 1.765237474894488e-08,
"loss": 0.0046,
"step": 19090
},
{
"epoch": 4.882413087934561,
"grad_norm": 0.02504415065050125,
"learning_rate": 1.691118025604066e-08,
"loss": 0.0001,
"step": 19100
},
{
"epoch": 4.884969325153374,
"grad_norm": 0.017662404105067253,
"learning_rate": 1.618585571240949e-08,
"loss": 0.0006,
"step": 19110
},
{
"epoch": 4.887525562372188,
"grad_norm": 0.005395929794758558,
"learning_rate": 1.5476403428035803e-08,
"loss": 0.0001,
"step": 19120
},
{
"epoch": 4.890081799591002,
"grad_norm": 0.004282295238226652,
"learning_rate": 1.478282566235667e-08,
"loss": 0.0013,
"step": 19130
},
{
"epoch": 4.8926380368098155,
"grad_norm": 0.006876455619931221,
"learning_rate": 1.4105124624251843e-08,
"loss": 0.0001,
"step": 19140
},
{
"epoch": 4.89519427402863,
"grad_norm": 2.063361406326294,
"learning_rate": 1.3443302472036513e-08,
"loss": 0.0009,
"step": 19150
},
{
"epoch": 4.897750511247444,
"grad_norm": 0.34553998708724976,
"learning_rate": 1.279736131345799e-08,
"loss": 0.0013,
"step": 19160
},
{
"epoch": 4.9003067484662575,
"grad_norm": 0.18306072056293488,
"learning_rate": 1.2167303205682934e-08,
"loss": 0.001,
"step": 19170
},
{
"epoch": 4.902862985685072,
"grad_norm": 0.007264157757163048,
"learning_rate": 1.1553130155297908e-08,
"loss": 0.0066,
"step": 19180
},
{
"epoch": 4.905419222903886,
"grad_norm": 0.006425623781979084,
"learning_rate": 1.0954844118296614e-08,
"loss": 0.0032,
"step": 19190
},
{
"epoch": 4.9079754601226995,
"grad_norm": 0.014158538542687893,
"learning_rate": 1.0372447000077113e-08,
"loss": 0.0001,
"step": 19200
},
{
"epoch": 4.910531697341513,
"grad_norm": 0.011595489457249641,
"learning_rate": 9.805940655436274e-09,
"loss": 0.0003,
"step": 19210
},
{
"epoch": 4.913087934560327,
"grad_norm": 3.0229008197784424,
"learning_rate": 9.2553268885609e-09,
"loss": 0.0012,
"step": 19220
},
{
"epoch": 4.9156441717791415,
"grad_norm": 0.048986513167619705,
"learning_rate": 8.720607453024388e-09,
"loss": 0.0009,
"step": 19230
},
{
"epoch": 4.918200408997955,
"grad_norm": 0.010676774196326733,
"learning_rate": 8.20178405178118e-09,
"loss": 0.0008,
"step": 19240
},
{
"epoch": 4.920756646216769,
"grad_norm": 0.006409882567822933,
"learning_rate": 7.698858337159553e-09,
"loss": 0.0005,
"step": 19250
},
{
"epoch": 4.923312883435583,
"grad_norm": 0.019153757020831108,
"learning_rate": 7.2118319108582805e-09,
"loss": 0.0002,
"step": 19260
},
{
"epoch": 4.925869120654397,
"grad_norm": 0.15216705203056335,
"learning_rate": 6.7407063239405264e-09,
"loss": 0.0005,
"step": 19270
},
{
"epoch": 4.928425357873211,
"grad_norm": 0.005260101519525051,
"learning_rate": 6.285483076828858e-09,
"loss": 0.0002,
"step": 19280
},
{
"epoch": 4.930981595092025,
"grad_norm": 0.012515813112258911,
"learning_rate": 5.846163619300238e-09,
"loss": 0.0006,
"step": 19290
},
{
"epoch": 4.933537832310838,
"grad_norm": 0.2751409411430359,
"learning_rate": 5.422749350482148e-09,
"loss": 0.0025,
"step": 19300
},
{
"epoch": 4.936094069529652,
"grad_norm": 0.006292239762842655,
"learning_rate": 5.015241618849254e-09,
"loss": 0.0017,
"step": 19310
},
{
"epoch": 4.938650306748467,
"grad_norm": 0.006389266811311245,
"learning_rate": 4.623641722215077e-09,
"loss": 0.0003,
"step": 19320
},
{
"epoch": 4.94120654396728,
"grad_norm": 0.00943728256970644,
"learning_rate": 4.247950907733112e-09,
"loss": 0.0002,
"step": 19330
},
{
"epoch": 4.943762781186094,
"grad_norm": 0.10176153481006622,
"learning_rate": 3.888170371887934e-09,
"loss": 0.0009,
"step": 19340
},
{
"epoch": 4.946319018404908,
"grad_norm": 0.5887343287467957,
"learning_rate": 3.5443012604957638e-09,
"loss": 0.0003,
"step": 19350
},
{
"epoch": 4.948875255623722,
"grad_norm": 0.013615554198622704,
"learning_rate": 3.2163446686966913e-09,
"loss": 0.0013,
"step": 19360
},
{
"epoch": 4.951431492842536,
"grad_norm": 2.769341230392456,
"learning_rate": 2.9043016409552317e-09,
"loss": 0.0029,
"step": 19370
},
{
"epoch": 4.95398773006135,
"grad_norm": 0.010591404512524605,
"learning_rate": 2.6081731710531076e-09,
"loss": 0.0022,
"step": 19380
},
{
"epoch": 4.956543967280163,
"grad_norm": 0.02332007698714733,
"learning_rate": 2.3279602020892522e-09,
"loss": 0.0001,
"step": 19390
},
{
"epoch": 4.959100204498977,
"grad_norm": 0.5851395130157471,
"learning_rate": 2.06366362647481e-09,
"loss": 0.0002,
"step": 19400
},
{
"epoch": 4.961656441717792,
"grad_norm": 0.0056032175198197365,
"learning_rate": 1.8152842859320286e-09,
"loss": 0.0001,
"step": 19410
},
{
"epoch": 4.9642126789366054,
"grad_norm": 0.011081293225288391,
"learning_rate": 1.5828229714892619e-09,
"loss": 0.0001,
"step": 19420
},
{
"epoch": 4.966768916155419,
"grad_norm": 0.00719296932220459,
"learning_rate": 1.366280423480415e-09,
"loss": 0.0012,
"step": 19430
},
{
"epoch": 4.969325153374233,
"grad_norm": 0.0061414423398673534,
"learning_rate": 1.1656573315421693e-09,
"loss": 0.0002,
"step": 19440
},
{
"epoch": 4.9718813905930475,
"grad_norm": 0.006891654338687658,
"learning_rate": 9.80954334611206e-10,
"loss": 0.0003,
"step": 19450
},
{
"epoch": 4.974437627811861,
"grad_norm": 0.35890260338783264,
"learning_rate": 8.121720209219864e-10,
"loss": 0.0025,
"step": 19460
},
{
"epoch": 4.976993865030675,
"grad_norm": 0.010992944240570068,
"learning_rate": 6.59310928006196e-10,
"loss": 0.0001,
"step": 19470
},
{
"epoch": 4.979550102249489,
"grad_norm": 0.010081345215439796,
"learning_rate": 5.2237154268997e-10,
"loss": 0.0003,
"step": 19480
},
{
"epoch": 4.982106339468302,
"grad_norm": 0.027087528258562088,
"learning_rate": 4.013543010927823e-10,
"loss": 0.0005,
"step": 19490
},
{
"epoch": 4.984662576687117,
"grad_norm": 0.005823803599923849,
"learning_rate": 2.962595886257802e-10,
"loss": 0.0006,
"step": 19500
},
{
"epoch": 4.987218813905931,
"grad_norm": 0.10786204785108566,
"learning_rate": 2.0708773999011945e-10,
"loss": 0.0003,
"step": 19510
},
{
"epoch": 4.989775051124744,
"grad_norm": 0.31824302673339844,
"learning_rate": 1.3383903917696394e-10,
"loss": 0.0009,
"step": 19520
},
{
"epoch": 4.992331288343558,
"grad_norm": 0.01443443726748228,
"learning_rate": 7.651371946637565e-11,
"loss": 0.0014,
"step": 19530
},
{
"epoch": 4.994887525562373,
"grad_norm": 0.014061720110476017,
"learning_rate": 3.511196342509404e-11,
"loss": 0.0015,
"step": 19540
},
{
"epoch": 4.997443762781186,
"grad_norm": 0.10343813896179199,
"learning_rate": 9.633902908201542e-12,
"loss": 0.0003,
"step": 19550
},
{
"epoch": 5.0,
"grad_norm": 0.009711273945868015,
"learning_rate": 7.961905745812459e-14,
"loss": 0.0002,
"step": 19560
},
{
"epoch": 5.0,
"step": 19560,
"total_flos": 1.975874722701312e+17,
"train_loss": 0.2911138574734241,
"train_runtime": 11035.9124,
"train_samples_per_second": 1.772,
"train_steps_per_second": 1.772
}
],
"logging_steps": 10,
"max_steps": 19560,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.975874722701312e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}